diff options
Diffstat (limited to 'fs')
1051 files changed, 68222 insertions, 50233 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 7af425f53be..8482f2d1160 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -156,7 +156,7 @@ int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid) return -EOPNOTSUPP; acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { - retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (retval) return retval; set_cached_acl(inode, ACL_TYPE_ACCESS, acl); @@ -200,7 +200,7 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, if (acl) { if (S_ISDIR(mode)) *dpacl = posix_acl_dup(acl); - retval = posix_acl_create(&acl, GFP_NOFS, &mode); + retval = __posix_acl_create(&acl, GFP_NOFS, &mode); if (retval < 0) return retval; if (retval > 0) diff --git a/fs/9p/cache.c b/fs/9p/cache.c index a9ea73d6dcf..a69260f2755 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -90,7 +90,7 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, &v9fs_cache_session_index_def, - v9ses); + v9ses, true); p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", v9ses, v9ses->fscache); } @@ -204,7 +204,7 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - v9inode); + v9inode, true); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9inode->fscache); @@ -239,13 +239,12 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode) void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) { struct v9fs_inode *v9inode = V9FS_I(inode); - struct p9_fid *fid; if (!v9inode->fscache) return; spin_lock(&v9inode->fscache_lock); - fid = filp->private_data; + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) v9fs_cache_inode_flush_cookie(inode); else @@ -271,7 +270,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - v9inode); + v9inode, true); p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", inode, old, v9inode->fscache); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 08f2e1e9a7e..6894b085f0e 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -56,7 +56,7 @@ enum { /* Options that take no arguments */ Opt_nodevmap, /* Cache options */ - Opt_cache_loose, Opt_fscache, + Opt_cache_loose, Opt_fscache, Opt_mmap, /* Access options */ Opt_access, Opt_posixacl, /* Error token */ @@ -74,6 +74,7 @@ static const match_table_t tokens = { {Opt_cache, "cache=%s"}, {Opt_cache_loose, "loose"}, {Opt_fscache, "fscache"}, + {Opt_mmap, "mmap"}, {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, @@ -91,6 +92,9 @@ static int get_cache_mode(char *s) } else if (!strcmp(s, "fscache")) { version = CACHE_FSCACHE; p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); + } else if (!strcmp(s, "mmap")) { + version = CACHE_MMAP; + p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n"); } else if (!strcmp(s, "none")) { version = CACHE_NONE; p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); @@ -220,6 +224,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_fscache: v9ses->cache = CACHE_FSCACHE; break; + case Opt_mmap: + v9ses->cache = CACHE_MMAP; + break; case Opt_cachetag: #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = match_strdup(&args[0]); @@ -530,7 +537,7 @@ static struct attribute_group v9fs_attr_group = { * */ -static int v9fs_sysfs_init(void) +static int __init v9fs_sysfs_init(void) { v9fs_kobj = kobject_create_and_add("9p", fs_kobj); if (!v9fs_kobj) diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index a8e127c8962..099c7712631 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -64,6 +64,7 @@ enum p9_session_flags { enum p9_cache_modes { CACHE_NONE, + CACHE_MMAP, CACHE_LOOSE, CACHE_FSCACHE, }; diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index dc95a252523..b83ebfbf3fd 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -50,6 +50,8 @@ extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; extern const struct file_operations v9fs_cached_file_operations; extern const struct file_operations v9fs_cached_file_operations_dotl; +extern const struct file_operations v9fs_mmap_file_operations; +extern const struct file_operations v9fs_mmap_file_operations_dotl; extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 9ff073f4090..cc1cfae726b 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -202,6 +202,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) { int retval; + p9_debug(P9_DEBUG_VFS, "page %p\n", page); + retval = v9fs_vfs_writepage_locked(page); if (retval < 0) { if (retval == -EAGAIN) { @@ -257,8 +259,7 @@ static int v9fs_launder_page(struct page *page) * */ static ssize_t -v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t pos, unsigned long nr_segs) +v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { /* * FIXME @@ -267,7 +268,7 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, */ p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n", iocb->ki_filp->f_path.dentry->d_name.name, - (long long)pos, nr_segs); + (long long)pos, iter->nr_segs); return -EINVAL; } @@ -282,6 +283,9 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = mapping->host; + + p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); + v9inode = V9FS_I(inode); start: page = grab_cache_page_write_begin(mapping, index, flags); @@ -312,6 +316,8 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, loff_t last_pos = pos + copied; struct inode *inode = page->mapping->host; + p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); + if (unlikely(copied < len)) { /* * zero out the rest of the area diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index f039b104a98..b03dd23feda 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -43,23 +43,6 @@ #include "fid.h" /** - * v9fs_dentry_delete - called when dentry refcount equals 0 - * @dentry: dentry in question - * - * By returning 1 here we should remove cacheing of unused - * dentry components. - * - */ - -static int v9fs_dentry_delete(const struct dentry *dentry) -{ - p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", - dentry->d_name.name, dentry); - - return 1; -} - -/** * v9fs_cached_dentry_delete - called when dentry refcount equals 0 * @dentry: dentry in question * @@ -134,6 +117,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = { }; const struct dentry_operations v9fs_dentry_operations = { - .d_delete = v9fs_dentry_delete, + .d_delete = always_delete_dentry, .d_release = v9fs_dentry_release, }; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 4d0c2e0be7e..0b3bfa303dd 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -42,7 +42,6 @@ /** * struct p9_rdir - readdir accounting - * @mutex: mutex protecting readdir * @head: start offset of current dirread buffer * @tail: end offset of current dirread buffer * @buf: dirread buffer diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a0df3e73c2b..520c11c2dcc 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -45,6 +45,7 @@ #include "cache.h" static const struct vm_operations_struct v9fs_file_vm_ops; +static const struct vm_operations_struct v9fs_mmap_file_vm_ops; /** * v9fs_file_open - open a file (or directory) @@ -87,7 +88,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) file->private_data = fid; mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((file->f_flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -105,7 +107,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9inode->writeback_fid = (void *) fid; } mutex_unlock(&v9inode->v_mutex); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(inode, file); return 0; out_error: @@ -350,9 +352,6 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, invalidate_mapping_pages(&inode->i_data, 0, -1); } /* Convert flock to posix lock */ - fl->fl_owner = (fl_owner_t)filp; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; fl->fl_flags |= FL_POSIX; fl->fl_flags ^= FL_FLOCK; @@ -461,14 +460,12 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, int n; loff_t i_size; size_t total = 0; - struct p9_client *clnt; loff_t origin = *offset; unsigned long pg_start, pg_end; p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); - clnt = fid->clnt; do { n = p9_client_write(fid, NULL, data+total, origin+total, count); if (n <= 0) @@ -581,11 +578,12 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, } static int -v9fs_file_mmap(struct file *file, struct vm_area_struct *vma) +v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) { int retval; - retval = generic_file_mmap(file, vma); + + retval = generic_file_mmap(filp, vma); if (!retval) vma->vm_ops = &v9fs_file_vm_ops; @@ -593,6 +591,43 @@ v9fs_file_mmap(struct file *file, struct vm_area_struct *vma) } static int +v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int retval; + struct inode *inode; + struct v9fs_inode *v9inode; + struct p9_fid *fid; + + inode = file_inode(filp); + v9inode = V9FS_I(inode); + mutex_lock(&v9inode->v_mutex); + if (!v9inode->writeback_fid && + (vma->vm_flags & VM_WRITE)) { + /* + * clone a fid and add it to writeback_fid + * we do it during mmap instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + fid = v9fs_writeback_fid(filp->f_path.dentry); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + mutex_unlock(&v9inode->v_mutex); + return retval; + } + v9inode->writeback_fid = (void *) fid; + } + mutex_unlock(&v9inode->v_mutex); + + retval = generic_file_mmap(filp, vma); + if (!retval) + vma->vm_ops = &v9fs_mmap_file_vm_ops; + + return retval; +} + +static int v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct v9fs_inode *v9inode; @@ -646,7 +681,7 @@ v9fs_direct_read(struct file *filp, char __user *udata, size_t count, /** * v9fs_cached_file_read - read from a file * @filp: file pointer to read - * @udata: user data buffer to read data into + * @data: user data buffer to read data into * @count: size of buffer * @offset: offset at which to read data * @@ -657,7 +692,23 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, { if (filp->f_flags & O_DIRECT) return v9fs_direct_read(filp, data, count, offset); - return do_sync_read(filp, data, count, offset); + return new_sync_read(filp, data, count, offset); +} + +/** + * v9fs_mmap_file_read - read from a file + * @filp: file pointer to read + * @data: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +static ssize_t +v9fs_mmap_file_read(struct file *filp, char __user *data, size_t count, + loff_t *offset) +{ + /* TODO: Check if there are dirty pages */ + return v9fs_file_read(filp, data, count, offset); } static ssize_t @@ -709,7 +760,7 @@ err_out: buff_write: mutex_unlock(&inode->i_mutex); - return do_sync_write(filp, data, count, offsetp); + return new_sync_write(filp, data, count, offsetp); } /** @@ -727,11 +778,66 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, if (filp->f_flags & O_DIRECT) return v9fs_direct_write(filp, data, count, offset); - return do_sync_write(filp, data, count, offset); + return new_sync_write(filp, data, count, offset); +} + + +/** + * v9fs_mmap_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_mmap_file_write(struct file *filp, const char __user *data, + size_t count, loff_t *offset) +{ + /* + * TODO: invalidate mmaps on filp's inode between + * offset and offset+count + */ + return v9fs_file_write(filp, data, count, offset); +} + +static void v9fs_mmap_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode; + + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + .sync_mode = WB_SYNC_ALL, + .range_start = vma->vm_pgoff * PAGE_SIZE, + /* absolute end, byte at end included */ + .range_end = vma->vm_pgoff * PAGE_SIZE + + (vma->vm_end - vma->vm_start - 1), + }; + + + p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); + + inode = file_inode(vma->vm_file); + + if (!mapping_cap_writeback_dirty(inode->i_mapping)) + wbc.nr_to_write = 0; + + might_sleep(); + sync_inode(inode, &wbc); } + static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = v9fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, +}; + +static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { + .close = v9fs_mmap_vm_close, + .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -741,8 +847,8 @@ const struct file_operations v9fs_cached_file_operations = { .llseek = generic_file_llseek, .read = v9fs_cached_file_read, .write = v9fs_cached_file_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock, @@ -754,8 +860,8 @@ const struct file_operations v9fs_cached_file_operations_dotl = { .llseek = generic_file_llseek, .read = v9fs_cached_file_read, .write = v9fs_cached_file_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, @@ -786,3 +892,26 @@ const struct file_operations v9fs_file_operations_dotl = { .mmap = generic_file_readonly_mmap, .fsync = v9fs_file_fsync_dotl, }; + +const struct file_operations v9fs_mmap_file_operations = { + .llseek = generic_file_llseek, + .read = v9fs_mmap_file_read, + .write = v9fs_mmap_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = v9fs_mmap_file_mmap, + .fsync = v9fs_file_fsync, +}; + +const struct file_operations v9fs_mmap_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = v9fs_mmap_file_read, + .write = v9fs_mmap_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = v9fs_mmap_file_mmap, + .fsync = v9fs_file_fsync_dotl, +}; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 4e65aa90334..7fa4f7a7653 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -147,7 +147,7 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, int major = -1, minor = -1; strlcpy(ext, stat->extension, sizeof(ext)); - sscanf(ext, "%c %u %u", &type, &major, &minor); + sscanf(ext, "%c %i %i", &type, &major, &minor); switch (type) { case 'c': res |= S_IFCHR; @@ -299,15 +299,22 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFREG: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || + v9ses->cache == CACHE_FSCACHE) inode->i_fop = &v9fs_cached_file_operations_dotl; + else if (v9ses->cache == CACHE_MMAP) + inode->i_fop = &v9fs_mmap_file_operations_dotl; else inode->i_fop = &v9fs_file_operations_dotl; } else { inode->i_op = &v9fs_file_inode_operations; - if (v9ses->cache) - inode->i_fop = &v9fs_cached_file_operations; + if (v9ses->cache == CACHE_LOOSE || + v9ses->cache == CACHE_FSCACHE) + inode->i_fop = + &v9fs_cached_file_operations; + else if (v9ses->cache == CACHE_MMAP) + inode->i_fop = &v9fs_mmap_file_operations; else inode->i_fop = &v9fs_file_operations; } @@ -444,7 +451,7 @@ void v9fs_evict_inode(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); - truncate_inode_pages(inode->i_mapping, 0); + truncate_inode_pages_final(inode->i_mapping); clear_inode(inode); filemap_fdatawrite(inode->i_mapping); @@ -573,7 +580,7 @@ static int v9fs_at_to_dotl_flags(int flags) * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted * @dentry: dentry that is being deleted - * @rmdir: removing a directory + * @flags: removing a directory * */ @@ -771,7 +778,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode * @dir: inode that is being walked from * @dentry: dentry that is being walked to? - * @nameidata: path data + * @flags: lookup flags (unused) * */ @@ -779,7 +786,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct dentry *res; - struct super_block *sb; struct v9fs_session_info *v9ses; struct p9_fid *dfid, *fid; struct inode *inode; @@ -791,7 +797,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); /* We can walk d_parent because we hold the dir->i_mutex */ dfid = v9fs_fid_lookup(dentry->d_parent); @@ -812,7 +817,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, * unlink. For cached mode create calls request for new * inode. But with cache disabled, lookup should do this. */ - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); else inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); @@ -863,7 +868,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, return finish_no_open(file, res); err = 0; - fid = NULL; + v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode); fid = v9fs_create(v9ses, dir, dentry, NULL, perm, @@ -878,7 +883,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9fs_invalidate_inode_attr(dir); v9inode = V9FS_I(dentry->d_inode); mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -901,7 +907,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, goto error; file->private_data = fid; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(dentry->d_inode, file); *opened |= FILE_CREATED; @@ -1318,7 +1324,7 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) * v9fs_vfs_mkspecial - create a special file * @dir: inode to create special file in * @dentry: dentry to create - * @mode: mode to create special file + * @perm: mode to create special file * @extension: 9p2000.u format extension string representing special file * */ @@ -1479,7 +1485,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) */ i_size = inode->i_size; v9fs_stat2inode(st, inode, inode->i_sb); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode->i_size = i_size; spin_unlock(&inode->i_lock); out: diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 4c10edec26a..1fa85aae24d 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -226,7 +226,7 @@ int v9fs_open_to_dotl_flags(int flags) * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. * @dir: directory inode that is being created * @dentry: dentry that is being deleted - * @mode: create permissions + * @omode: create permissions * */ @@ -330,7 +330,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -353,7 +354,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) goto err_clunk_old_fid; file->private_data = ofid; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(inode, file); *opened |= FILE_CREATED; out: @@ -374,7 +375,7 @@ err_clunk_old_fid: * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked - * @mode: mode for new directory + * @omode: mode for new directory * */ @@ -473,13 +474,11 @@ static int v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - int err; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_stat_dotl *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); - err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { generic_fillattr(dentry->d_inode, stat); @@ -556,7 +555,6 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) { int retval; - struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_iattr_dotl p9attr; struct inode *inode = dentry->d_inode; @@ -577,8 +575,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) p9attr.mtime_sec = iattr->ia_mtime.tv_sec; p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; - retval = -EPERM; - v9ses = v9fs_dentry2v9ses(dentry); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -611,7 +607,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate - * @sb: superblock of filesystem * */ @@ -715,7 +710,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, } v9fs_invalidate_inode_attr(dir); - if (v9ses->cache) { + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { /* Now walk from the parent so we can get an unopened fid. */ fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { @@ -768,7 +763,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int err; - char *name; struct dentry *dir_dentry; struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; @@ -786,8 +780,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, if (IS_ERR(oldfid)) return PTR_ERR(oldfid); - name = (char *) dentry->d_name.name; - err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); if (err < 0) { @@ -815,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, * v9fs_vfs_mknod_dotl - create a special file * @dir: inode destination for new link * @dentry: dentry for file - * @mode: mode for creation + * @omode: mode for creation * @rdev: device associated with special file * */ @@ -973,7 +965,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) */ i_size = inode->i_size; v9fs_stat2inode_dotl(st, inode); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode->i_size = i_size; spin_unlock(&inode->i_lock); out: diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 2756dcd5de6..0afd0382822 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -144,7 +144,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } v9fs_fill_super(sb, v9ses, flags, data); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) sb->s_d_op = &v9fs_cached_dentry_operations; else sb->s_d_op = &v9fs_dentry_operations; @@ -282,7 +282,7 @@ static int v9fs_drop_inode(struct inode *inode) { struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) return generic_drop_inode(inode); /* * in case of non cached mode always drop the @@ -325,10 +325,12 @@ static int v9fs_write_inode_dotl(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n", + __func__, inode, v9inode->writeback_fid); if (!v9inode->writeback_fid) return 0; + ret = p9_client_fsync(v9inode->writeback_fid, 0); if (ret < 0) { __mark_inode_dirty(inode, I_DIRTY_DATASYNC); diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 3c28cdfb8c4..f95e01e058e 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -138,8 +138,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, if (retval < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n", retval); - p9_client_clunk(fid); - return retval; + goto err; } msize = fid->clnt->msize; while (value_len) { @@ -152,12 +151,15 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, if (write_count < 0) { /* error in xattr write */ retval = write_count; - break; + goto err; } offset += write_count; value_len -= write_count; } - return p9_client_clunk(fid); + retval = 0; +err: + p9_client_clunk(fid); + return retval; } ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) diff --git a/fs/Kconfig b/fs/Kconfig index c229f828eb0..312393f3294 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -68,10 +68,6 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" -config GENERIC_ACL - bool - select FS_POSIX_ACL - menu "Caches" source "fs/fscache/Kconfig" @@ -100,6 +96,7 @@ endif # BLOCK menu "Pseudo filesystems" source "fs/proc/Kconfig" +source "fs/kernfs/Kconfig" source "fs/sysfs/Kconfig" config TMPFS @@ -119,7 +116,7 @@ config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" depends on TMPFS select TMPFS_XATTR - select GENERIC_ACL + select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support additional access rights for users and groups beyond the standard owner/group/world scheme, diff --git a/fs/Makefile b/fs/Makefile index 4fe6df3ec28..4030cbfbc9a 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o +obj-y += buffer.o block_dev.o direct-io.o mpage.o else obj-y += no-block.o endif obj-$(CONFIG_PROC_FS) += proc_namespace.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_ANON_INODES) += anon_inodes.o @@ -42,9 +41,8 @@ obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o -obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o +obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ -obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_SYSCTL) += drop_caches.o @@ -53,6 +51,7 @@ obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ +obj-$(CONFIG_KERNFS) += kernfs/ obj-$(CONFIG_SYSFS) += sysfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 585adafb0cc..c770337c4b4 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -43,9 +43,12 @@ struct adfs_dir_ops; * ADFS file system superblock data in memory */ struct adfs_sb_info { - struct adfs_discmap *s_map; /* bh list containing map */ - struct adfs_dir_ops *s_dir; /* directory operations */ - + union { struct { + struct adfs_discmap *s_map; /* bh list containing map */ + struct adfs_dir_ops *s_dir; /* directory operations */ + }; + struct rcu_head rcu; /* used only at shutdown time */ + }; kuid_t s_uid; /* owner uid */ kgid_t s_gid; /* owner gid */ umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ diff --git a/fs/adfs/file.c b/fs/adfs/file.c index a36da5382b4..07c9edce5aa 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -23,12 +23,12 @@ const struct file_operations adfs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .read_iter = generic_file_read_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .splice_read = generic_file_splice_read, }; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 0ff4bae2c2a..9852bdf34d7 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -123,8 +123,7 @@ static void adfs_put_super(struct super_block *sb) for (i = 0; i < asb->s_map_size; i++) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); - kfree(asb); - sb->s_fs_info = NULL; + kfree_rcu(asb, rcu); } static int adfs_show_options(struct seq_file *seq, struct dentry *root) @@ -213,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options) static int adfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return parse_options(sb, data); } @@ -266,7 +266,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), diff --git a/fs/affs/Changes b/fs/affs/Changes index a29409c1ffe..b41c2c9792f 100644 --- a/fs/affs/Changes +++ b/fs/affs/Changes @@ -91,7 +91,7 @@ more 2.4 fixes: [Roman Zippel] Version 3.11 ------------ -- Converted to use 2.3.x page cache [Dave Jones <dave@powertweak.com>] +- Converted to use 2.3.x page cache [Dave Jones] - Corruption in truncate() bugfix [Ken Tyler <kent@werple.net.au>] Version 3.10 diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 3952121f2f2..9bca8815972 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -1,3 +1,9 @@ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/fs.h> #include <linux/buffer_head.h> @@ -5,14 +11,6 @@ #include <linux/mutex.h> #include <linux/workqueue.h> -/* AmigaOS allows file names with up to 30 characters length. - * Names longer than that will be silently truncated. If you - * want to disallow this, comment out the following #define. - * Creating filesystem objects with longer names will then - * result in an error (ENAMETOOLONG). - */ -/*#define AFFS_NO_TRUNCATE */ - /* Ugly macros make the code more pretty. */ #define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st)))) @@ -28,7 +26,6 @@ #define AFFS_CACHE_SIZE PAGE_SIZE -#define AFFS_MAX_PREALLOC 32 #define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2) #define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) #define AFFS_AC_MASK (AFFS_AC_SIZE-1) @@ -118,6 +115,7 @@ struct affs_sb_info { #define SF_OFS 0x0200 /* Old filesystem */ #define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ #define SF_VERBOSE 0x0800 /* Talk about fs when mounting */ +#define SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */ /* short cut to get to the affs specific sb data */ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) @@ -137,9 +135,13 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void secs_to_datestamp(time_t secs, struct affs_date *ds); extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); -extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); -extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); -extern int affs_check_name(const unsigned char *name, int len); +extern void affs_error(struct super_block *sb, const char *function, + const char *fmt, ...); +extern void affs_warning(struct super_block *sb, const char *function, + const char *fmt, ...); +extern bool affs_nofilenametruncate(const struct dentry *dentry); +extern int affs_check_name(const unsigned char *name, int len, + bool notruncate); extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry); /* bitmap. c */ @@ -210,7 +212,7 @@ affs_set_blocksize(struct super_block *sb, int size) static inline struct buffer_head * affs_bread(struct super_block *sb, int block) { - pr_debug("affs_bread: %d\n", block); + pr_debug("%s: %d\n", __func__, block); if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) return sb_bread(sb, block); return NULL; @@ -218,7 +220,7 @@ affs_bread(struct super_block *sb, int block) static inline struct buffer_head * affs_getblk(struct super_block *sb, int block) { - pr_debug("affs_getblk: %d\n", block); + pr_debug("%s: %d\n", __func__, block); if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) return sb_getblk(sb, block); return NULL; @@ -227,7 +229,7 @@ static inline struct buffer_head * affs_getzeroblk(struct super_block *sb, int block) { struct buffer_head *bh; - pr_debug("affs_getzeroblk: %d\n", block); + pr_debug("%s: %d\n", __func__, block); if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) { bh = sb_getblk(sb, block); lock_buffer(bh); @@ -242,7 +244,7 @@ static inline struct buffer_head * affs_getemptyblk(struct super_block *sb, int block) { struct buffer_head *bh; - pr_debug("affs_getemptyblk: %d\n", block); + pr_debug("%s: %d\n", __func__, block); if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) { bh = sb_getblk(sb, block); wait_on_buffer(bh); @@ -255,7 +257,7 @@ static inline void affs_brelse(struct buffer_head *bh) { if (bh) - pr_debug("affs_brelse: %lld\n", (long long) bh->b_blocknr); + pr_debug("%s: %lld\n", __func__, (long long) bh->b_blocknr); brelse(bh); } diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index d9a43674cb9..406b29836b1 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -34,7 +34,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) ino = bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]); - pr_debug("AFFS: insert_hash(dir=%u, ino=%d)\n", (u32)dir->i_ino, ino); + pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino); dir_bh = affs_bread(sb, dir->i_ino); if (!dir_bh) @@ -84,7 +84,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) sb = dir->i_sb; rem_ino = rem_bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]); - pr_debug("AFFS: remove_hash(dir=%d, ino=%d, hashval=%d)\n", (u32)dir->i_ino, rem_ino, offset); + pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n", + __func__, (u32)dir->i_ino, rem_ino, offset); bh = affs_bread(sb, dir->i_ino); if (!bh) @@ -147,7 +148,7 @@ affs_remove_link(struct dentry *dentry) u32 link_ino, ino; int retval; - pr_debug("AFFS: remove_link(key=%ld)\n", inode->i_ino); + pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); retval = -EIO; bh = affs_bread(sb, inode->i_ino); if (!bh) @@ -279,7 +280,7 @@ affs_remove_header(struct dentry *dentry) if (!inode) goto done; - pr_debug("AFFS: remove_header(key=%ld)\n", inode->i_ino); + pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); retval = -EIO; bh = affs_bread(sb, (u32)(long)dentry->d_fsdata); if (!bh) @@ -451,10 +452,10 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...) vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); va_end(args); - printk(KERN_CRIT "AFFS error (device %s): %s(): %s\n", sb->s_id, + pr_crit("error (device %s): %s(): %s\n", sb->s_id, function,ErrorBuffer); if (!(sb->s_flags & MS_RDONLY)) - printk(KERN_WARNING "AFFS: Remounting filesystem read-only\n"); + pr_warn("Remounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; } @@ -467,24 +468,31 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); va_end(args); - printk(KERN_WARNING "AFFS warning (device %s): %s(): %s\n", sb->s_id, + pr_warn("(device %s): %s(): %s\n", sb->s_id, function,ErrorBuffer); } +bool +affs_nofilenametruncate(const struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE; + +} + /* Check if the name is valid for a affs object. */ int -affs_check_name(const unsigned char *name, int len) +affs_check_name(const unsigned char *name, int len, bool notruncate) { int i; - if (len > 30) -#ifdef AFFS_NO_TRUNCATE - return -ENAMETOOLONG; -#else - len = 30; -#endif - + if (len > 30) { + if (notruncate) + return -ENAMETOOLONG; + else + len = 30; + } for (i = 0; i < len; i++) { if (name[i] < ' ' || name[i] == ':' || (name[i] > 0x7e && name[i] < 0xa0)) diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index a32246b8359..c8de51185c2 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -17,7 +17,7 @@ affs_count_free_blocks(struct super_block *sb) u32 free; int i; - pr_debug("AFFS: count_free_blocks()\n"); + pr_debug("%s()\n", __func__); if (sb->s_flags & MS_RDONLY) return 0; @@ -43,7 +43,7 @@ affs_free_block(struct super_block *sb, u32 block) u32 blk, bmap, bit, mask, tmp; __be32 *data; - pr_debug("AFFS: free_block(%u)\n", block); + pr_debug("%s(%u)\n", __func__, block); if (block > sbi->s_partition_size) goto err_range; @@ -125,7 +125,7 @@ affs_alloc_block(struct inode *inode, u32 goal) sb = inode->i_sb; sbi = AFFS_SB(sb); - pr_debug("AFFS: balloc(inode=%lu,goal=%u): ", inode->i_ino, goal); + pr_debug("balloc(inode=%lu,goal=%u): ", inode->i_ino, goal); if (AFFS_I(inode)->i_pa_cnt) { pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1); @@ -254,8 +254,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) return 0; if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) { - printk(KERN_NOTICE "AFFS: Bitmap invalid - mounting %s read only\n", - sb->s_id); + pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id); *flags |= MS_RDONLY; return 0; } @@ -268,7 +267,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) size = sbi->s_bmap_count * sizeof(*bm); bm = sbi->s_bitmap = kzalloc(size, GFP_KERNEL); if (!sbi->s_bitmap) { - printk(KERN_ERR "AFFS: Bitmap allocation failed\n"); + pr_err("Bitmap allocation failed\n"); return -ENOMEM; } @@ -282,17 +281,17 @@ int affs_init_bitmap(struct super_block *sb, int *flags) bm->bm_key = be32_to_cpu(bmap_blk[blk]); bh = affs_bread(sb, bm->bm_key); if (!bh) { - printk(KERN_ERR "AFFS: Cannot read bitmap\n"); + pr_err("Cannot read bitmap\n"); res = -EIO; goto out; } if (affs_checksum_block(sb, bh)) { - printk(KERN_WARNING "AFFS: Bitmap %u invalid - mounting %s read only.\n", - bm->bm_key, sb->s_id); + pr_warn("Bitmap %u invalid - mounting %s read only.\n", + bm->bm_key, sb->s_id); *flags |= MS_RDONLY; goto out; } - pr_debug("AFFS: read bitmap block %d: %d\n", blk, bm->bm_key); + pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key); bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4); /* Don't try read the extension if this is the last block, @@ -304,7 +303,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) affs_brelse(bmap_bh); bmap_bh = affs_bread(sb, be32_to_cpu(bmap_blk[blk])); if (!bmap_bh) { - printk(KERN_ERR "AFFS: Cannot read bitmap extension\n"); + pr_err("Cannot read bitmap extension\n"); res = -EIO; goto out; } diff --git a/fs/affs/dir.c b/fs/affs/dir.c index f1eba8c3644..59f07bec92a 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -52,8 +52,10 @@ affs_readdir(struct file *file, struct dir_context *ctx) int hash_pos; int chain_pos; u32 ino; + int error = 0; - pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos); + pr_debug("%s(ino=%lu,f_pos=%lx)\n", + __func__, inode->i_ino, (unsigned long)ctx->pos); if (ctx->pos < 2) { file->private_data = (void *)0; @@ -72,14 +74,14 @@ affs_readdir(struct file *file, struct dir_context *ctx) } dir_bh = affs_bread(sb, inode->i_ino); if (!dir_bh) - goto readdir_out; + goto out_unlock_dir; /* If the directory hasn't changed since the last call to readdir(), * we can jump directly to where we left off. */ ino = (u32)(long)file->private_data; if (ino && file->f_version == inode->i_version) { - pr_debug("AFFS: readdir() left off=%d\n", ino); + pr_debug("readdir() left off=%d\n", ino); goto inside; } @@ -88,7 +90,8 @@ affs_readdir(struct file *file, struct dir_context *ctx) fh_bh = affs_bread(sb, ino); if (!fh_bh) { affs_error(sb, "readdir","Cannot read block %d", i); - return -EIO; + error = -EIO; + goto out_brelse_dir; } ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); @@ -107,29 +110,34 @@ inside: do { fh_bh = affs_bread(sb, ino); if (!fh_bh) { - affs_error(sb, "readdir","Cannot read block %d", ino); + affs_error(sb, "readdir", + "Cannot read block %d", ino); break; } namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); name = AFFS_TAIL(sb, fh_bh)->name + 1; - pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n", + pr_debug("readdir(): dir_emit(\"%.*s\", " + "ino=%u), hash=%d, f_pos=%x\n", namelen, name, ino, hash_pos, (u32)ctx->pos); + if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN)) - goto readdir_done; + goto done; ctx->pos++; ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); fh_bh = NULL; } while (ino); } -readdir_done: +done: file->f_version = inode->i_version; file->private_data = (void *)(long)ino; + affs_brelse(fh_bh); -readdir_out: +out_brelse_dir: affs_brelse(dir_bh); - affs_brelse(fh_bh); + +out_unlock_dir: affs_unlock_dir(inode); - return 0; + return error; } diff --git a/fs/affs/file.c b/fs/affs/file.c index 8669b6ecdde..a7fe57d2cd9 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -27,10 +27,10 @@ static int affs_file_release(struct inode *inode, struct file *filp); const struct file_operations affs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = affs_file_open, .release = affs_file_release, @@ -45,7 +45,7 @@ const struct inode_operations affs_file_inode_operations = { static int affs_file_open(struct inode *inode, struct file *filp) { - pr_debug("AFFS: open(%lu,%d)\n", + pr_debug("open(%lu,%d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); atomic_inc(&AFFS_I(inode)->i_opencnt); return 0; @@ -54,7 +54,7 @@ affs_file_open(struct inode *inode, struct file *filp) static int affs_file_release(struct inode *inode, struct file *filp) { - pr_debug("AFFS: release(%lu, %d)\n", + pr_debug("release(%lu, %d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { @@ -324,7 +324,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul struct buffer_head *ext_bh; u32 ext; - pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block); + pr_debug("%s(%u, %lu)\n", + __func__, (u32)inode->i_ino, (unsigned long)block); BUG_ON(block > (sector_t)0x7fffffffUL); @@ -498,34 +499,36 @@ affs_getemptyblk_ino(struct inode *inode, int block) } static int -affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to) +affs_do_readpage_ofs(struct page *page, unsigned to) { struct inode *inode = page->mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; char *data; + unsigned pos = 0; u32 bidx, boff, bsize; u32 tmp; - pr_debug("AFFS: read_page(%u, %ld, %d, %d)\n", (u32)inode->i_ino, page->index, from, to); - BUG_ON(from > to || to > PAGE_CACHE_SIZE); + pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino, + page->index, to); + BUG_ON(to > PAGE_CACHE_SIZE); kmap(page); data = page_address(page); bsize = AFFS_SB(sb)->s_data_blksize; - tmp = (page->index << PAGE_CACHE_SHIFT) + from; + tmp = page->index << PAGE_CACHE_SHIFT; bidx = tmp / bsize; boff = tmp % bsize; - while (from < to) { + while (pos < to) { bh = affs_bread_ino(inode, bidx, 0); if (IS_ERR(bh)) return PTR_ERR(bh); - tmp = min(bsize - boff, to - from); - BUG_ON(from + tmp > to || tmp > bsize); - memcpy(data + from, AFFS_DATA(bh) + boff, tmp); + tmp = min(bsize - boff, to - pos); + BUG_ON(pos + tmp > to || tmp > bsize); + memcpy(data + pos, AFFS_DATA(bh) + boff, tmp); affs_brelse(bh); bidx++; - from += tmp; + pos += tmp; boff = 0; } flush_dcache_page(page); @@ -542,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) u32 size, bsize; u32 tmp; - pr_debug("AFFS: extent_file(%u, %d)\n", (u32)inode->i_ino, newsize); + pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize); bsize = AFFS_SB(sb)->s_data_blksize; bh = NULL; size = AFFS_I(inode)->mmu_private; @@ -608,14 +611,14 @@ affs_readpage_ofs(struct file *file, struct page *page) u32 to; int err; - pr_debug("AFFS: read_page(%u, %ld)\n", (u32)inode->i_ino, page->index); + pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index); to = PAGE_CACHE_SIZE; if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) { to = inode->i_size & ~PAGE_CACHE_MASK; memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to); } - err = affs_do_readpage_ofs(file, page, 0, to); + err = affs_do_readpage_ofs(page, to); if (!err) SetPageUptodate(page); unlock_page(page); @@ -631,7 +634,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping pgoff_t index; int err = 0; - pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len); + pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino, + (unsigned long long)pos, (unsigned long long)pos + len); if (pos > AFFS_I(inode)->mmu_private) { /* XXX: this probably leaves a too-big i_size in case of * failure. Should really be updating i_size at write_end time @@ -651,7 +655,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping return 0; /* XXX: inefficient but safe in the face of short writes */ - err = affs_do_readpage_ofs(file, page, 0, PAGE_CACHE_SIZE); + err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE); if (err) { unlock_page(page); page_cache_release(page); @@ -680,7 +684,9 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, * due to write_begin. */ - pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len); + pr_debug("%s(%u, %llu, %llu)\n", + __func__, (u32)inode->i_ino, (unsigned long long)pos, + (unsigned long long)pos + len); bsize = AFFS_SB(sb)->s_data_blksize; data = page_address(page); @@ -802,7 +808,7 @@ affs_free_prealloc(struct inode *inode) { struct super_block *sb = inode->i_sb; - pr_debug("AFFS: free_prealloc(ino=%lu)\n", inode->i_ino); + pr_debug("free_prealloc(ino=%lu)\n", inode->i_ino); while (AFFS_I(inode)->i_pa_cnt) { AFFS_I(inode)->i_pa_cnt--; @@ -822,7 +828,7 @@ affs_truncate(struct inode *inode) struct buffer_head *ext_bh; int i; - pr_debug("AFFS: truncate(inode=%d, oldsize=%u, newsize=%u)\n", + pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n", (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size); last_blk = 0; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 0e092d08680..bec2d1a0c91 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -34,7 +34,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) if (!(inode->i_state & I_NEW)) return inode; - pr_debug("AFFS: affs_iget(%lu)\n", inode->i_ino); + pr_debug("affs_iget(%lu)\n", inode->i_ino); block = inode->i_ino; bh = affs_bread(sb, block); @@ -175,7 +175,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) uid_t uid; gid_t gid; - pr_debug("AFFS: write_inode(%lu)\n",inode->i_ino); + pr_debug("write_inode(%lu)\n", inode->i_ino); if (!inode->i_nlink) // possibly free block @@ -220,7 +220,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr) struct inode *inode = dentry->d_inode; int error; - pr_debug("AFFS: notify_change(%lu,0x%x)\n",inode->i_ino,attr->ia_valid); + pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); error = inode_change_ok(inode,attr); if (error) @@ -258,8 +258,9 @@ void affs_evict_inode(struct inode *inode) { unsigned long cache_page; - pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); - truncate_inode_pages(&inode->i_data, 0); + pr_debug("evict_inode(ino=%lu, nlink=%u)\n", + inode->i_ino, inode->i_nlink); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; @@ -271,7 +272,7 @@ affs_evict_inode(struct inode *inode) affs_free_prealloc(inode); cache_page = (unsigned long)AFFS_I(inode)->i_lc; if (cache_page) { - pr_debug("AFFS: freeing ext cache\n"); + pr_debug("freeing ext cache\n"); AFFS_I(inode)->i_lc = NULL; AFFS_I(inode)->i_ac = NULL; free_page(cache_page); @@ -350,7 +351,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 u32 block = 0; int retval; - pr_debug("AFFS: add_entry(dir=%u, inode=%u, \"%*s\", type=%d)\n", (u32)dir->i_ino, + pr_debug("%s(dir=%u, inode=%u, \"%*s\", type=%d)\n", + __func__, (u32)dir->i_ino, (u32)inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name, type); retval = -EIO; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index c36cbb4537a..035bd31556f 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -60,13 +60,13 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. */ static inline int -__affs_hash_dentry(struct qstr *qstr, toupper_t toupper) +__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate) { const u8 *name = qstr->name; unsigned long hash; int i; - i = affs_check_name(qstr->name, qstr->len); + i = affs_check_name(qstr->name, qstr->len, notruncate); if (i) return i; @@ -82,16 +82,22 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper) static int affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_toupper); + return __affs_hash_dentry(qstr, affs_toupper, + affs_nofilenametruncate(dentry)); + } + static int affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_intl_toupper); + return __affs_hash_dentry(qstr, affs_intl_toupper, + affs_nofilenametruncate(dentry)); + } static inline int __affs_compare_dentry(unsigned int len, - const char *str, const struct qstr *name, toupper_t toupper) + const char *str, const struct qstr *name, toupper_t toupper, + bool notruncate) { const u8 *aname = str; const u8 *bname = name->name; @@ -101,7 +107,7 @@ static inline int __affs_compare_dentry(unsigned int len, * must be valid. 'name' must be validated first. */ - if (affs_check_name(name->name, name->len)) + if (affs_check_name(name->name, name->len, notruncate)) return 1; /* @@ -126,13 +132,18 @@ static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(len, str, name, affs_toupper); + + return __affs_compare_dentry(len, str, name, affs_toupper, + affs_nofilenametruncate(parent)); } + static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(len, str, name, affs_intl_toupper); + return __affs_compare_dentry(len, str, name, affs_intl_toupper, + affs_nofilenametruncate(parent)); + } /* @@ -179,7 +190,8 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) toupper_t toupper = affs_get_toupper(sb); u32 key; - pr_debug("AFFS: find_entry(\"%.*s\")\n", (int)dentry->d_name.len, dentry->d_name.name); + pr_debug("%s(\"%.*s\")\n", + __func__, (int)dentry->d_name.len, dentry->d_name.name); bh = affs_bread(sb, dir->i_ino); if (!bh) @@ -207,7 +219,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) struct buffer_head *bh; struct inode *inode = NULL; - pr_debug("AFFS: lookup(\"%.*s\")\n",(int)dentry->d_name.len,dentry->d_name.name); + pr_debug("%s(\"%.*s\")\n", + __func__, (int)dentry->d_name.len, dentry->d_name.name); affs_lock_dir(dir); bh = affs_find_entry(dir, dentry); @@ -237,9 +250,9 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) int affs_unlink(struct inode *dir, struct dentry *dentry) { - pr_debug("AFFS: unlink(dir=%d, %lu \"%.*s\")\n", (u32)dir->i_ino, - dentry->d_inode->i_ino, - (int)dentry->d_name.len, dentry->d_name.name); + pr_debug("%s(dir=%d, %lu \"%.*s\")\n", + __func__, (u32)dir->i_ino, dentry->d_inode->i_ino, + (int)dentry->d_name.len, dentry->d_name.name); return affs_remove_header(dentry); } @@ -251,7 +264,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) struct inode *inode; int error; - pr_debug("AFFS: create(%lu,\"%.*s\",0%ho)\n",dir->i_ino,(int)dentry->d_name.len, + pr_debug("%s(%lu,\"%.*s\",0%ho)\n", + __func__, dir->i_ino, (int)dentry->d_name.len, dentry->d_name.name,mode); inode = affs_new_inode(dir); @@ -280,8 +294,9 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int error; - pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%ho)\n",dir->i_ino, - (int)dentry->d_name.len,dentry->d_name.name,mode); + pr_debug("%s(%lu,\"%.*s\",0%ho)\n", + __func__, dir->i_ino, (int)dentry->d_name.len, + dentry->d_name.name, mode); inode = affs_new_inode(dir); if (!inode) @@ -306,8 +321,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int affs_rmdir(struct inode *dir, struct dentry *dentry) { - pr_debug("AFFS: rmdir(dir=%u, %lu \"%.*s\")\n", (u32)dir->i_ino, - dentry->d_inode->i_ino, + pr_debug("%s(dir=%u, %lu \"%.*s\")\n", + __func__, (u32)dir->i_ino, dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); return affs_remove_header(dentry); @@ -323,8 +338,9 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) int i, maxlen, error; char c, lc; - pr_debug("AFFS: symlink(%lu,\"%.*s\" -> \"%s\")\n",dir->i_ino, - (int)dentry->d_name.len,dentry->d_name.name,symname); + pr_debug("%s(%lu,\"%.*s\" -> \"%s\")\n", + __func__, dir->i_ino, (int)dentry->d_name.len, + dentry->d_name.name, symname); maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1; inode = affs_new_inode(dir); @@ -393,7 +409,8 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; - pr_debug("AFFS: link(%u, %u, \"%.*s\")\n", (u32)inode->i_ino, (u32)dir->i_ino, + pr_debug("%s(%u, %u, \"%.*s\")\n", + __func__, (u32)inode->i_ino, (u32)dir->i_ino, (int)dentry->d_name.len,dentry->d_name.name); return affs_add_entry(dir, inode, dentry, ST_LINKFILE); @@ -407,11 +424,15 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh = NULL; int retval; - pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n", - (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, - (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); + pr_debug("%s(old=%u,\"%*s\" to new=%u,\"%*s\")\n", + __func__, (u32)old_dir->i_ino, (int)old_dentry->d_name.len, + old_dentry->d_name.name, (u32)new_dir->i_ino, + (int)new_dentry->d_name.len, new_dentry->d_name.name); + + retval = affs_check_name(new_dentry->d_name.name, + new_dentry->d_name.len, + affs_nofilenametruncate(old_dentry)); - retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len); if (retval) return retval; diff --git a/fs/affs/super.c b/fs/affs/super.c index 45161a832bb..51f1a95bff7 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -46,14 +46,9 @@ static void affs_put_super(struct super_block *sb) { struct affs_sb_info *sbi = AFFS_SB(sb); - pr_debug("AFFS: put_super()\n"); + pr_debug("%s()\n", __func__); cancel_delayed_work_sync(&sbi->sb_work); - kfree(sbi->s_prefix); - affs_free_bitmap(sb); - affs_brelse(sbi->s_root_bh); - kfree(sbi); - sb->s_fs_info = NULL; } static int @@ -133,7 +128,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { affs_inode_cachep = kmem_cache_create("affs_inode_cache", sizeof(struct affs_inode_info), @@ -168,7 +163,7 @@ static const struct super_operations affs_sops = { }; enum { - Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect, + Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect, Opt_reserved, Opt_root, Opt_setgid, Opt_setuid, Opt_verbose, Opt_volume, Opt_ignore, Opt_err, }; @@ -177,6 +172,7 @@ static const match_table_t tokens = { {Opt_bs, "bs=%u"}, {Opt_mode, "mode=%o"}, {Opt_mufs, "mufs"}, + {Opt_notruncate, "nofilenametruncate"}, {Opt_prefix, "prefix=%s"}, {Opt_protect, "protect"}, {Opt_reserved, "reserved=%u"}, @@ -224,7 +220,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, return 0; if (n != 512 && n != 1024 && n != 2048 && n != 4096) { - printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); + pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); return 0; } *blocksize = n; @@ -238,6 +234,9 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, case Opt_mufs: *mount_opts |= SF_MUFS; break; + case Opt_notruncate: + *mount_opts |= SF_NO_TRUNCATE; + break; case Opt_prefix: *prefix = match_strdup(&args[0]); if (!*prefix) @@ -286,8 +285,8 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, /* Silently ignore the quota options */ break; default: - printk("AFFS: Unrecognized mount option \"%s\" " - "or missing value\n", p); + pr_warn("Unrecognized mount option \"%s\" or missing value\n", + p); return 0; } } @@ -316,11 +315,11 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) unsigned long mount_flags; int tmp_flags; /* fix remount prototype... */ u8 sig[4]; - int ret = -EINVAL; + int ret; save_mount_options(sb, data); - pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options"); + pr_debug("read_super(%s)\n", data ? (const char *)data : "no options"); sb->s_magic = AFFS_SUPER_MAGIC; sb->s_op = &affs_sops; @@ -340,9 +339,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { - printk(KERN_ERR "AFFS: Error parsing options\n"); - kfree(sbi->s_prefix); - kfree(sbi); + pr_err("Error parsing options\n"); return -EINVAL; } /* N.B. after this point s_prefix must be released */ @@ -359,7 +356,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) */ size = sb->s_bdev->bd_inode->i_size >> 9; - pr_debug("AFFS: initial blocksize=%d, #blocks=%d\n", 512, size); + pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size); affs_set_blocksize(sb, PAGE_SIZE); /* Try to find root block. Its location depends on the block size. */ @@ -374,7 +371,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_root_block = root_block; if (root_block < 0) sbi->s_root_block = (reserved + size - 1) / 2; - pr_debug("AFFS: setting blocksize to %d\n", blocksize); + pr_debug("setting blocksize to %d\n", blocksize); affs_set_blocksize(sb, blocksize); sbi->s_partition_size = size; @@ -389,7 +386,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) * block behind the calculated one. So we check this one, too. */ for (num_bm = 0; num_bm < 2; num_bm++) { - pr_debug("AFFS: Dev %s, trying root=%u, bs=%d, " + pr_debug("Dev %s, trying root=%u, bs=%d, " "size=%d, reserved=%d\n", sb->s_id, sbi->s_root_block + num_bm, @@ -410,19 +407,20 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) } } if (!silent) - printk(KERN_ERR "AFFS: No valid root block on device %s\n", - sb->s_id); - goto out_error; + pr_err("No valid root block on device %s\n", sb->s_id); + return -EINVAL; /* N.B. after this point bh must be released */ got_root: + /* Keep super block in cache */ + sbi->s_root_bh = root_bh; root_block = sbi->s_root_block; /* Find out which kind of FS we have */ boot_bh = sb_bread(sb, 0); if (!boot_bh) { - printk(KERN_ERR "AFFS: Cannot read boot block\n"); - goto out_error; + pr_err("Cannot read boot block\n"); + return -EINVAL; } memcpy(sig, boot_bh->b_data, 4); brelse(boot_bh); @@ -434,8 +432,7 @@ got_root: */ if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS || chksum == MUFS_DCOFS) && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_NOTICE "AFFS: Dircache FS - mounting %s read only\n", - sb->s_id); + pr_notice("Dircache FS - mounting %s read only\n", sb->s_id); sb->s_flags |= MS_RDONLY; } switch (chksum) { @@ -469,14 +466,14 @@ got_root: sb->s_flags |= MS_NOEXEC; break; default: - printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n", - sb->s_id, chksum); - goto out_error; + pr_err("Unknown filesystem on device %s: %08X\n", + sb->s_id, chksum); + return -EINVAL; } if (mount_flags & SF_VERBOSE) { u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0]; - printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", + pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", len > 31 ? 31 : len, AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1, sig, sig[3] + '0', blocksize); @@ -488,22 +485,17 @@ got_root: if (sbi->s_flags & SF_OFS) sbi->s_data_blksize -= 24; - /* Keep super block in cache */ - sbi->s_root_bh = root_bh; - /* N.B. after this point s_root_bh must be released */ - tmp_flags = sb->s_flags; - if (affs_init_bitmap(sb, &tmp_flags)) - goto out_error; + ret = affs_init_bitmap(sb, &tmp_flags); + if (ret) + return ret; sb->s_flags = tmp_flags; /* set up enough so that it can read an inode */ root_inode = affs_iget(sb, root_block); - if (IS_ERR(root_inode)) { - ret = PTR_ERR(root_inode); - goto out_error; - } + if (IS_ERR(root_inode)) + return PTR_ERR(root_inode); if (AFFS_SB(sb)->s_flags & SF_INTL) sb->s_d_op = &affs_intl_dentry_operations; @@ -512,23 +504,12 @@ got_root: sb->s_root = d_make_root(root_inode); if (!sb->s_root) { - printk(KERN_ERR "AFFS: Get root inode failed\n"); - goto out_error; + pr_err("AFFS: Get root inode failed\n"); + return -ENOMEM; } - pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); + pr_debug("s_flags=%lX\n", sb->s_flags); return 0; - - /* - * Begin the cascaded cleanup ... - */ -out_error: - kfree(sbi->s_bitmap); - affs_brelse(root_bh); - kfree(sbi->s_prefix); - kfree(sbi); - sb->s_fs_info = NULL; - return ret; } static int @@ -547,8 +528,9 @@ affs_remount(struct super_block *sb, int *flags, char *data) char volume[32]; char *prefix = NULL; - pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); + pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); + sync_filesystem(sb); *flags |= MS_NODIRATIME; memcpy(volume, sbi->s_volume, 32); @@ -594,8 +576,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) int free; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size, - AFFS_SB(sb)->s_reserved); + pr_debug("%s() partsize=%d, reserved=%d\n", + __func__, AFFS_SB(sb)->s_partition_size, + AFFS_SB(sb)->s_reserved); free = affs_count_free_blocks(sb); buf->f_type = AFFS_SUPER_MAGIC; @@ -615,11 +598,23 @@ static struct dentry *affs_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); } +static void affs_kill_sb(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + kill_block_super(sb); + if (sbi) { + affs_free_bitmap(sb); + affs_brelse(sbi->s_root_bh); + kfree(sbi->s_prefix); + kfree(sbi); + } +} + static struct file_system_type affs_fs_type = { .owner = THIS_MODULE, .name = "affs", .mount = affs_mount, - .kill_sb = kill_block_super, + .kill_sb = affs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("affs"); diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index ee00f08c4f5..f39b71c3981 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -21,7 +21,7 @@ static int affs_symlink_readpage(struct file *file, struct page *page) char c; char lc; - pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino); + pr_debug("follow_link(ino=%lu)\n", inode->i_ino); err = -EIO; bh = affs_bread(inode->i_sb, inode->i_ino); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 3c090b7555e..ca0a3cf9379 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -179,7 +179,7 @@ struct afs_cell *afs_cell_create(const char *name, unsigned namesz, /* put it up for caching (this never returns an error) */ cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, &afs_cell_cache_index_def, - cell); + cell, true); #endif /* add to the cell lists */ diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 1c8c6cc6de3..4b0eff6da67 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -130,6 +130,15 @@ static void afs_cm_destructor(struct afs_call *call) { _enter(""); + /* Break the callbacks here so that we do it after the final ACK is + * received. The step number here must match the final number in + * afs_deliver_cb_callback(). + */ + if (call->unmarshall == 6) { + ASSERT(call->server && call->count && call->request); + afs_break_callbacks(call->server, call->count, call->request); + } + afs_put_server(call->server); call->server = NULL; kfree(call->buffer); @@ -272,6 +281,16 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, _debug("trailer"); if (skb->len != 0) return -EBADMSG; + + /* Record that the message was unmarshalled successfully so + * that the call destructor can know do the callback breaking + * work, even if the final ACK isn't received. + * + * If the step number changes, then afs_cm_destructor() must be + * updated also. + */ + call->unmarshall++; + case 6: break; } diff --git a/fs/afs/file.c b/fs/afs/file.c index 66d50fe2ee4..932ce07948b 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -31,10 +31,10 @@ const struct file_operations afs_file_operations = { .open = afs_open, .release = afs_release, .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = afs_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = afs_file_write, .mmap = generic_file_readonly_mmap, .splice_read = generic_file_splice_read, .fsync = afs_fsync, diff --git a/fs/afs/flock.c b/fs/afs/flock.c index a8cf2cff836..4baf1d2b39e 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -555,10 +555,6 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) return -ENOLCK; /* we're simulating flock() locks using posix locks on the server */ - fl->fl_owner = (fl_owner_t) file; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; - if (fl->fl_type == F_UNLCK) return afs_do_unlk(file, fl); return afs_do_setlk(file, fl); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 789bc253b5f..29467128844 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -259,7 +259,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, #ifdef CONFIG_AFS_FSCACHE vnode->cache = fscache_acquire_cookie(vnode->volume->cache, &afs_vnode_cache_index_def, - vnode); + vnode, true); #endif ret = afs_inode_map_status(vnode, key); @@ -422,7 +422,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); afs_give_up_callback(vnode); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a306bb6d88d..71d5982312f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -75,6 +75,7 @@ struct afs_call { const struct afs_call_type *type; /* type of call */ const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ + void (*async_workfn)(struct afs_call *call); /* asynchronous work function */ struct work_struct async_work; /* asynchronous work processor */ struct work_struct work; /* actual work processor */ struct sk_buff_head rx_queue; /* received packets */ @@ -195,7 +196,6 @@ struct afs_cell { struct list_head link; /* main cell list link */ struct key *anonymous_key; /* anonymous user key for this cell */ struct list_head proc_link; /* /proc cell list link */ - struct proc_dir_entry *proc_dir; /* /proc dir for this cell */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif @@ -747,8 +747,7 @@ extern int afs_write_end(struct file *file, struct address_space *mapping, extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); -extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, - unsigned long, loff_t); +extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); extern int afs_writeback_all(struct afs_vnode *); extern int afs_fsync(struct file *, loff_t, loff_t, int); diff --git a/fs/afs/main.c b/fs/afs/main.c index 42dd2e499ed..35de0c04729 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -55,13 +55,13 @@ static int __init afs_get_client_UUID(void) afs_uuid.time_low = uuidtime; afs_uuid.time_mid = uuidtime >> 32; afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK; - afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME; + afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME; get_random_bytes(&clockseq, 2); afs_uuid.clock_seq_low = clockseq; afs_uuid.clock_seq_hi_and_reserved = (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK; - afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD; + afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD; _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", afs_uuid.time_low, diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 526e4bbbde5..24a905b076f 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -41,11 +41,8 @@ static const struct file_operations afs_proc_cells_fops = { .write = afs_proc_cells_write, .llseek = seq_lseek, .release = seq_release, - .owner = THIS_MODULE, }; -static int afs_proc_rootcell_open(struct inode *inode, struct file *file); -static int afs_proc_rootcell_release(struct inode *inode, struct file *file); static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, size_t size, loff_t *_pos); static ssize_t afs_proc_rootcell_write(struct file *file, @@ -53,17 +50,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file, size_t size, loff_t *_pos); static const struct file_operations afs_proc_rootcell_fops = { - .open = afs_proc_rootcell_open, .read = afs_proc_rootcell_read, .write = afs_proc_rootcell_write, .llseek = no_llseek, - .release = afs_proc_rootcell_release, - .owner = THIS_MODULE, }; static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); -static int afs_proc_cell_volumes_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, loff_t *pos); @@ -81,14 +73,11 @@ static const struct file_operations afs_proc_cell_volumes_fops = { .open = afs_proc_cell_volumes_open, .read = seq_read, .llseek = seq_lseek, - .release = afs_proc_cell_volumes_release, - .owner = THIS_MODULE, + .release = seq_release, }; static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file); -static int afs_proc_cell_vlservers_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, loff_t *pos); @@ -106,13 +95,10 @@ static const struct file_operations afs_proc_cell_vlservers_fops = { .open = afs_proc_cell_vlservers_open, .read = seq_read, .llseek = seq_lseek, - .release = afs_proc_cell_vlservers_release, - .owner = THIS_MODULE, + .release = seq_release, }; static int afs_proc_cell_servers_open(struct inode *inode, struct file *file); -static int afs_proc_cell_servers_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, loff_t *pos); @@ -130,8 +116,7 @@ static const struct file_operations afs_proc_cell_servers_fops = { .open = afs_proc_cell_servers_open, .read = seq_read, .llseek = seq_lseek, - .release = afs_proc_cell_servers_release, - .owner = THIS_MODULE, + .release = seq_release, }; /* @@ -139,29 +124,21 @@ static const struct file_operations afs_proc_cell_servers_fops = { */ int afs_proc_init(void) { - struct proc_dir_entry *p; - _enter(""); proc_afs = proc_mkdir("fs/afs", NULL); if (!proc_afs) goto error_dir; - p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); - if (!p) - goto error_cells; - - p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops); - if (!p) - goto error_rootcell; + if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) || + !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops)) + goto error_tree; _leave(" = 0"); return 0; -error_rootcell: - remove_proc_entry("cells", proc_afs); -error_cells: - remove_proc_entry("fs/afs", NULL); +error_tree: + remove_proc_subtree("fs/afs", NULL); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -172,9 +149,7 @@ error_dir: */ void afs_proc_cleanup(void) { - remove_proc_entry("rootcell", proc_afs); - remove_proc_entry("cells", proc_afs); - remove_proc_entry("fs/afs", NULL); + remove_proc_subtree("fs/afs", NULL); } /* @@ -319,19 +294,6 @@ inval: goto done; } -/* - * Stubs for /proc/fs/afs/rootcell - */ -static int afs_proc_rootcell_open(struct inode *inode, struct file *file) -{ - return 0; -} - -static int afs_proc_rootcell_release(struct inode *inode, struct file *file) -{ - return 0; -} - static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, size_t size, loff_t *_pos) { @@ -387,38 +349,27 @@ nomem: */ int afs_proc_cell_setup(struct afs_cell *cell) { - struct proc_dir_entry *p; + struct proc_dir_entry *dir; _enter("%p{%s}", cell, cell->name); - cell->proc_dir = proc_mkdir(cell->name, proc_afs); - if (!cell->proc_dir) + dir = proc_mkdir(cell->name, proc_afs); + if (!dir) goto error_dir; - p = proc_create_data("servers", 0, cell->proc_dir, - &afs_proc_cell_servers_fops, cell); - if (!p) - goto error_servers; - - p = proc_create_data("vlservers", 0, cell->proc_dir, - &afs_proc_cell_vlservers_fops, cell); - if (!p) - goto error_vlservers; - - p = proc_create_data("volumes", 0, cell->proc_dir, - &afs_proc_cell_volumes_fops, cell); - if (!p) - goto error_volumes; + if (!proc_create_data("servers", 0, dir, + &afs_proc_cell_servers_fops, cell) || + !proc_create_data("vlservers", 0, dir, + &afs_proc_cell_vlservers_fops, cell) || + !proc_create_data("volumes", 0, dir, + &afs_proc_cell_volumes_fops, cell)) + goto error_tree; _leave(" = 0"); return 0; -error_volumes: - remove_proc_entry("vlservers", cell->proc_dir); -error_vlservers: - remove_proc_entry("servers", cell->proc_dir); -error_servers: - remove_proc_entry(cell->name, proc_afs); +error_tree: + remove_proc_subtree(cell->name, proc_afs); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -431,10 +382,7 @@ void afs_proc_cell_remove(struct afs_cell *cell) { _enter(""); - remove_proc_entry("volumes", cell->proc_dir); - remove_proc_entry("vlservers", cell->proc_dir); - remove_proc_entry("servers", cell->proc_dir); - remove_proc_entry(cell->name, proc_afs); + remove_proc_subtree(cell->name, proc_afs); _leave(""); } @@ -463,14 +411,6 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) } /* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ @@ -569,15 +509,6 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) } /* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_vlservers_release(struct inode *inode, - struct file *file) -{ - return seq_release(inode, file); -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ @@ -673,15 +604,6 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) } /* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_servers_release(struct inode *inode, - struct file *file) -{ - return seq_release(inode, file); -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 8ad8c2a0703..03a3beb1700 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -25,7 +25,7 @@ static void afs_wake_up_call_waiter(struct afs_call *); static int afs_wait_for_call_to_complete(struct afs_call *); static void afs_wake_up_async_call(struct afs_call *); static int afs_dont_wait_for_call_to_complete(struct afs_call *); -static void afs_process_async_call(struct work_struct *); +static void afs_process_async_call(struct afs_call *); static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); @@ -58,6 +58,13 @@ static void afs_collect_incoming_call(struct work_struct *); static struct sk_buff_head afs_incoming_calls; static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); +static void afs_async_workfn(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, async_work); + + call->async_workfn(call); +} + /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT @@ -184,6 +191,28 @@ static void afs_free_call(struct afs_call *call) } /* + * End a call but do not free it + */ +static void afs_end_call_nofree(struct afs_call *call) +{ + if (call->rxcall) { + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + } + if (call->type->destructor) + call->type->destructor(call); +} + +/* + * End a call and free it + */ +static void afs_end_call(struct afs_call *call) +{ + afs_end_call_nofree(call); + afs_free_call(call); +} + +/* * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, @@ -326,7 +355,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, atomic_read(&afs_outstanding_calls)); call->wait_mode = wait_mode; - INIT_WORK(&call->async_work, afs_process_async_call); + call->async_workfn = afs_process_async_call; + INIT_WORK(&call->async_work, afs_async_workfn); memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -383,11 +413,8 @@ error_do_abort: rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); - rxrpc_kernel_end_call(rxcall); - call->rxcall = NULL; error_kill_call: - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" = %d", ret); return ret; } @@ -509,12 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state >= AFS_CALL_COMPLETE) { while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); - if (call->incoming) { - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); - } + if (call->incoming) + afs_end_call(call); } _leave(""); @@ -564,10 +587,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) } _debug("call complete"); - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" = %d", ret); return ret; } @@ -603,11 +623,8 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call) /* * delete an asynchronous call */ -static void afs_delete_async_call(struct work_struct *work) +static void afs_delete_async_call(struct afs_call *call) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); - _enter(""); afs_free_call(call); @@ -620,11 +637,8 @@ static void afs_delete_async_call(struct work_struct *work) * - on a multiple-thread workqueue this work item may try to run on several * CPUs at the same time */ -static void afs_process_async_call(struct work_struct *work) +static void afs_process_async_call(struct afs_call *call) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); - _enter(""); if (!skb_queue_empty(&call->rx_queue)) @@ -637,14 +651,11 @@ static void afs_process_async_call(struct work_struct *work) call->reply = NULL; /* kill the call */ - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - if (call->type->destructor) - call->type->destructor(call); + afs_end_call_nofree(call); /* we can't just delete the call because the work item may be * queued */ - PREPARE_WORK(&call->async_work, afs_delete_async_call); + call->async_workfn = afs_delete_async_call; queue_work(afs_async_calls, &call->async_work); } @@ -685,7 +696,8 @@ static void afs_collect_incoming_call(struct work_struct *work) return; } - INIT_WORK(&call->async_work, afs_process_async_call); + call->async_workfn = afs_process_async_call; + INIT_WORK(&call->async_work, afs_async_workfn); call->wait_mode = &afs_async_incoming_call; call->type = &afs_RXCMxxxx; init_waitqueue_head(&call->waitq); @@ -782,10 +794,7 @@ void afs_send_empty_reply(struct afs_call *call) _debug("oom"); rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); default: - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" [error]"); return; } @@ -815,17 +824,16 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) call->state = AFS_CALL_AWAIT_ACK; n = rxrpc_kernel_send_data(call->rxcall, &msg, len); if (n >= 0) { + /* Success */ _leave(" [replied]"); return; } + if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); } - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" [error]"); } diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 57bcb159653..b6df2e83809 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -308,7 +308,8 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl, /* see if we have an in-cache copy (will set vl->valid if there is) */ #ifdef CONFIG_AFS_FSCACHE vl->cache = fscache_acquire_cookie(vl->cell->cache, - &afs_vlocation_cache_index_def, vl); + &afs_vlocation_cache_index_def, vl, + true); #endif if (vl->valid) { diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 401eeb21869..2b607257820 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -131,7 +131,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) #ifdef CONFIG_AFS_FSCACHE volume->cache = fscache_acquire_cookie(vlocation->cache, &afs_volume_cache_index_def, - volume); + volume, true); #endif afs_get_vlocation(vlocation); volume->vlocation = vlocation; diff --git a/fs/afs/write.c b/fs/afs/write.c index a890db4b989..ab6adfd5251 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -625,15 +625,14 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) /* * write to an AFS file */ -ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); ssize_t result; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(from); - _enter("{%x.%u},{%zu},%lu,", - vnode->fid.vid, vnode->fid.vnode, count, nr_segs); + _enter("{%x.%u},{%zu},", + vnode->fid.vid, vnode->fid.vnode, count); if (IS_SWAPFILE(&vnode->vfs_inode)) { printk(KERN_INFO @@ -644,7 +643,7 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, if (!count) return 0; - result = generic_file_aio_write(iocb, iov, nr_segs, pos); + result = generic_file_write_iter(iocb, from); if (IS_ERR_VALUE(result)) { _leave(" = %zd", result); return result; @@ -36,10 +36,10 @@ #include <linux/eventfd.h> #include <linux/blkdev.h> #include <linux/compat.h> -#include <linux/anon_inodes.h> #include <linux/migrate.h> #include <linux/ramfs.h> #include <linux/percpu-refcount.h> +#include <linux/mount.h> #include <asm/kmap_types.h> #include <asm/uaccess.h> @@ -52,7 +52,8 @@ struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ - unsigned head; + unsigned head; /* Written to by userland or under ring_lock + * mutex by aio_read_events_ring(). */ unsigned tail; unsigned magic; @@ -80,6 +81,8 @@ struct kioctx { struct percpu_ref users; atomic_t dead; + struct percpu_ref reqs; + unsigned long user_id; struct __percpu kioctx_cpu *cpu; @@ -107,9 +110,13 @@ struct kioctx { struct page **ring_pages; long nr_pages; - struct rcu_head rcu_head; struct work_struct free_work; + /* + * signals when all in-flight requests are done + */ + struct completion *requests_done; + struct { /* * This counts the number of available slots in the ringbuffer, @@ -152,12 +159,67 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; +static struct vfsmount *aio_mnt; + +static const struct file_operations aio_ring_fops; +static const struct address_space_operations aio_ctx_aops; + +static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) +{ + struct qstr this = QSTR_INIT("[aio]", 5); + struct file *file; + struct path path; + struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + inode->i_mapping->a_ops = &aio_ctx_aops; + inode->i_mapping->private_data = ctx; + inode->i_size = PAGE_SIZE * nr_pages; + + path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); + if (!path.dentry) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + path.mnt = mntget(aio_mnt); + + d_instantiate(path.dentry, inode); + file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops); + if (IS_ERR(file)) { + path_put(&path); + return file; + } + + file->f_flags = O_RDWR; + file->private_data = ctx; + return file; +} + +static struct dentry *aio_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static const struct dentry_operations ops = { + .d_dname = simple_dname, + }; + return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1); +} + /* aio_setup * Creates the slab caches used by the aio routines, panic on * failure as this is done early during the boot sequence. */ static int __init aio_setup(void) { + static struct file_system_type aio_fs = { + .name = "aio", + .mount = aio_mount, + .kill_sb = kill_anon_super, + }; + aio_mnt = kern_mount(&aio_fs); + if (IS_ERR(aio_mnt)) + panic("Failed to create aio fs mount."); + kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); @@ -187,16 +249,26 @@ static void aio_free_ring(struct kioctx *ctx) { int i; + /* Disconnect the kiotx from the ring file. This prevents future + * accesses to the kioctx from page migration. + */ + put_aio_ring_file(ctx); + for (i = 0; i < ctx->nr_pages; i++) { + struct page *page; pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, page_count(ctx->ring_pages[i])); - put_page(ctx->ring_pages[i]); + page = ctx->ring_pages[i]; + if (!page) + continue; + ctx->ring_pages[i] = NULL; + put_page(page); } - put_aio_ring_file(ctx); - - if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) + if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) { kfree(ctx->ring_pages); + ctx->ring_pages = NULL; + } } static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) @@ -220,38 +292,66 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, { struct kioctx *ctx; unsigned long flags; + pgoff_t idx; int rc; + rc = 0; + + /* mapping->private_lock here protects against the kioctx teardown. */ + spin_lock(&mapping->private_lock); + ctx = mapping->private_data; + if (!ctx) { + rc = -EINVAL; + goto out; + } + + /* The ring_lock mutex. The prevents aio_read_events() from writing + * to the ring's head, and prevents page migration from mucking in + * a partially initialized kiotx. + */ + if (!mutex_trylock(&ctx->ring_lock)) { + rc = -EAGAIN; + goto out; + } + + idx = old->index; + if (idx < (pgoff_t)ctx->nr_pages) { + /* Make sure the old page hasn't already been changed */ + if (ctx->ring_pages[idx] != old) + rc = -EAGAIN; + } else + rc = -EINVAL; + + if (rc != 0) + goto out_unlock; + /* Writeback must be complete */ BUG_ON(PageWriteback(old)); - put_page(old); + get_page(new); - rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); if (rc != MIGRATEPAGE_SUCCESS) { - get_page(old); - return rc; + put_page(new); + goto out_unlock; } - get_page(new); - - /* We can potentially race against kioctx teardown here. Use the - * address_space's private data lock to protect the mapping's - * private_data. + /* Take completion_lock to prevent other writes to the ring buffer + * while the old page is copied to the new. This prevents new + * events from being lost. */ - spin_lock(&mapping->private_lock); - ctx = mapping->private_data; - if (ctx) { - pgoff_t idx; - spin_lock_irqsave(&ctx->completion_lock, flags); - migrate_page_copy(new, old); - idx = old->index; - if (idx < (pgoff_t)ctx->nr_pages) - ctx->ring_pages[idx] = new; - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else - rc = -EBUSY; - spin_unlock(&mapping->private_lock); + spin_lock_irqsave(&ctx->completion_lock, flags); + migrate_page_copy(new, old); + BUG_ON(ctx->ring_pages[idx] != old); + ctx->ring_pages[idx] = new; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + /* The old page is no longer accessible. */ + put_page(old); + +out_unlock: + mutex_unlock(&ctx->ring_lock); +out: + spin_unlock(&mapping->private_lock); return rc; } #endif @@ -268,7 +368,7 @@ static int aio_setup_ring(struct kioctx *ctx) struct aio_ring *ring; unsigned nr_events = ctx->max_reqs; struct mm_struct *mm = current->mm; - unsigned long size, populate; + unsigned long size, unused; int nr_pages; int i; struct file *file; @@ -283,15 +383,25 @@ static int aio_setup_ring(struct kioctx *ctx) if (nr_pages < 0) return -EINVAL; - file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR); + file = aio_private_file(ctx, nr_pages); if (IS_ERR(file)) { ctx->aio_ring_file = NULL; - return -EAGAIN; + return -ENOMEM; } - file->f_inode->i_mapping->a_ops = &aio_ctx_aops; - file->f_inode->i_mapping->private_data = ctx; - file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages; + ctx->aio_ring_file = file; + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) + / sizeof(struct io_event); + + ctx->ring_pages = ctx->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!ctx->ring_pages) { + put_aio_ring_file(ctx); + return -ENOMEM; + } + } for (i = 0; i < nr_pages; i++) { struct page *page; @@ -304,17 +414,14 @@ static int aio_setup_ring(struct kioctx *ctx) SetPageUptodate(page); SetPageDirty(page); unlock_page(page); + + ctx->ring_pages[i] = page; } - ctx->aio_ring_file = file; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) - / sizeof(struct io_event); + ctx->nr_pages = i; - ctx->ring_pages = ctx->internal_pages; - if (nr_pages > AIO_RING_PAGES) { - ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_KERNEL); - if (!ctx->ring_pages) - return -ENOMEM; + if (unlikely(i != nr_pages)) { + aio_free_ring(ctx); + return -ENOMEM; } ctx->mmap_size = nr_pages * PAGE_SIZE; @@ -323,37 +430,16 @@ static int aio_setup_ring(struct kioctx *ctx) down_write(&mm->mmap_sem); ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, 0, &populate); + MAP_SHARED, 0, &unused); + up_write(&mm->mmap_sem); if (IS_ERR((void *)ctx->mmap_base)) { - up_write(&mm->mmap_sem); ctx->mmap_size = 0; aio_free_ring(ctx); - return -EAGAIN; + return -ENOMEM; } pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); - /* We must do this while still holding mmap_sem for write, as we - * need to be protected against userspace attempting to mremap() - * or munmap() the ring buffer. - */ - ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, - 1, 0, ctx->ring_pages, NULL); - - /* Dropping the reference here is safe as the page cache will hold - * onto the pages for us. It is also required so that page migration - * can unmap the pages and get the right reference count. - */ - for (i = 0; i < ctx->nr_pages; i++) - put_page(ctx->ring_pages[i]); - - up_write(&mm->mmap_sem); - - if (unlikely(ctx->nr_pages != nr_pages)) { - aio_free_ring(ctx); - return -EAGAIN; - } - ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ @@ -391,7 +477,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) +static int kiocb_cancel(struct kiocb *kiocb) { kiocb_cancel_fn *old, *cancel; @@ -412,26 +498,38 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) return cancel(kiocb); } -static void free_ioctx_rcu(struct rcu_head *head) +static void free_ioctx(struct work_struct *work) { - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + struct kioctx *ctx = container_of(work, struct kioctx, free_work); + pr_debug("freeing %p\n", ctx); + + aio_free_ring(ctx); free_percpu(ctx->cpu); kmem_cache_free(kioctx_cachep, ctx); } +static void free_ioctx_reqs(struct percpu_ref *ref) +{ + struct kioctx *ctx = container_of(ref, struct kioctx, reqs); + + /* At this point we know that there are no any in-flight requests */ + if (ctx->requests_done) + complete(ctx->requests_done); + + INIT_WORK(&ctx->free_work, free_ioctx); + schedule_work(&ctx->free_work); +} + /* * When this function runs, the kioctx has been removed from the "hash table" * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ -static void free_ioctx(struct work_struct *work) +static void free_ioctx_users(struct percpu_ref *ref) { - struct kioctx *ctx = container_of(work, struct kioctx, free_work); - struct aio_ring *ring; + struct kioctx *ctx = container_of(ref, struct kioctx, users); struct kiocb *req; - unsigned cpu, avail; - DEFINE_WAIT(wait); spin_lock_irq(&ctx->ctx_lock); @@ -440,59 +538,13 @@ static void free_ioctx(struct work_struct *work) struct kiocb, ki_list); list_del_init(&req->ki_list); - kiocb_cancel(ctx, req); + kiocb_cancel(req); } spin_unlock_irq(&ctx->ctx_lock); - for_each_possible_cpu(cpu) { - struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); - - atomic_add(kcpu->reqs_available, &ctx->reqs_available); - kcpu->reqs_available = 0; - } - - while (1) { - prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE); - - ring = kmap_atomic(ctx->ring_pages[0]); - avail = (ring->head <= ring->tail) - ? ring->tail - ring->head - : ctx->nr_events - ring->head + ring->tail; - - atomic_add(avail, &ctx->reqs_available); - ring->head = ring->tail; - kunmap_atomic(ring); - - if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1) - break; - - schedule(); - } - finish_wait(&ctx->wait, &wait); - - WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); - - aio_free_ring(ctx); - - pr_debug("freeing %p\n", ctx); - - /* - * Here the call_rcu() is between the wait_event() for reqs_active to - * hit 0, and freeing the ioctx. - * - * aio_complete() decrements reqs_active, but it has to touch the ioctx - * after to issue a wakeup so we use rcu. - */ - call_rcu(&ctx->rcu_head, free_ioctx_rcu); -} - -static void free_ioctx_ref(struct percpu_ref *ref) -{ - struct kioctx *ctx = container_of(ref, struct kioctx, users); - - INIT_WORK(&ctx->free_work, free_ioctx); - schedule_work(&ctx->free_work); + percpu_ref_kill(&ctx->reqs); + percpu_ref_put(&ctx->reqs); } static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) @@ -514,6 +566,10 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); + /* While kioctx setup is in progress, + * we are protected from page migration + * changes ring_pages by ->ring_lock. + */ ring = kmap_atomic(ctx->ring_pages[0]); ring->id = ctx->id; kunmap_atomic(ring); @@ -551,6 +607,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) } } +static void aio_nr_sub(unsigned nr) +{ + spin_lock(&aio_nr_lock); + if (WARN_ON(aio_nr - nr > aio_nr)) + aio_nr = 0; + else + aio_nr -= nr; + spin_unlock(&aio_nr_lock); +} + /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ @@ -588,22 +654,29 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - if (percpu_ref_init(&ctx->users, free_ioctx_ref)) - goto out_freectx; - spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); + /* Protect against page migration throughout kiotx setup by keeping + * the ring_lock mutex held until setup is complete. */ + mutex_lock(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); + if (percpu_ref_init(&ctx->users, free_ioctx_users)) + goto err; + + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + goto err; + ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) - goto out_freeref; + goto err; - if (aio_setup_ring(ctx) < 0) - goto out_freepcpu; + err = aio_setup_ring(ctx); + if (err < 0) + goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); @@ -615,32 +688,35 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (aio_nr + nr_events > (aio_max_nr * 2UL) || aio_nr + nr_events < aio_nr) { spin_unlock(&aio_nr_lock); - goto out_cleanup; + err = -EAGAIN; + goto err_ctx; } aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); - percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */ err = ioctx_add_table(ctx, mm); if (err) - goto out_cleanup_put; + goto err_cleanup; + + /* Release the ring_lock mutex now that all setup is complete. */ + mutex_unlock(&ctx->ring_lock); pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; -out_cleanup_put: - percpu_ref_put(&ctx->users); -out_cleanup: - err = -EAGAIN; +err_cleanup: + aio_nr_sub(ctx->max_reqs); +err_ctx: aio_free_ring(ctx); -out_freepcpu: +err: + mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); -out_freeref: + free_percpu(ctx->reqs.pcpu_count); free_percpu(ctx->users.pcpu_count); -out_freectx: - put_aio_ring_file(ctx); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); @@ -651,40 +727,42 @@ out_freectx: * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) +static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, + struct completion *requests_done) { - if (!atomic_xchg(&ctx->dead, 1)) { - struct kioctx_table *table; + struct kioctx_table *table; - spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); + if (atomic_xchg(&ctx->dead, 1)) + return -EINVAL; - WARN_ON(ctx != table->table[ctx->id]); - table->table[ctx->id] = NULL; - rcu_read_unlock(); - spin_unlock(&mm->ioctx_lock); - /* percpu_ref_kill() will do the necessary call_rcu() */ - wake_up_all(&ctx->wait); + spin_lock(&mm->ioctx_lock); + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + + WARN_ON(ctx != table->table[ctx->id]); + table->table[ctx->id] = NULL; + rcu_read_unlock(); + spin_unlock(&mm->ioctx_lock); - /* - * It'd be more correct to do this in free_ioctx(), after all - * the outstanding kiocbs have finished - but by then io_destroy - * has already returned, so io_setup() could potentially return - * -EAGAIN with no ioctxs actually in use (as far as userspace - * could tell). - */ - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - ctx->max_reqs > aio_nr); - aio_nr -= ctx->max_reqs; - spin_unlock(&aio_nr_lock); + /* percpu_ref_kill() will do the necessary call_rcu() */ + wake_up_all(&ctx->wait); + + /* + * It'd be more correct to do this in free_ioctx(), after all + * the outstanding kiocbs have finished - but by then io_destroy + * has already returned, so io_setup() could potentially return + * -EAGAIN with no ioctxs actually in use (as far as userspace + * could tell). + */ + aio_nr_sub(ctx->max_reqs); - if (ctx->mmap_size) - vm_munmap(ctx->mmap_base, ctx->mmap_size); + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); - percpu_ref_kill(&ctx->users); - } + ctx->requests_done = requests_done; + percpu_ref_kill(&ctx->users); + return 0; } /* wait_on_sync_kiocb: @@ -745,23 +823,27 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx); + kill_ioctx(mm, ctx, NULL); } } static void put_reqs_available(struct kioctx *ctx, unsigned nr) { struct kioctx_cpu *kcpu; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { kcpu->reqs_available -= ctx->req_batch; atomic_add(ctx->req_batch, &ctx->reqs_available); } + local_irq_restore(flags); preempt_enable(); } @@ -769,10 +851,12 @@ static bool get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); if (!kcpu->reqs_available) { int old, avail = atomic_read(&ctx->reqs_available); @@ -791,6 +875,7 @@ static bool get_reqs_available(struct kioctx *ctx) ret = true; kcpu->reqs_available--; out: + local_irq_restore(flags); preempt_enable(); return ret; } @@ -810,6 +895,8 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) if (unlikely(!req)) goto out_put; + percpu_ref_get(&ctx->reqs); + req->ki_ctx = ctx; return req; out_put: @@ -879,12 +966,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) return; } - /* - * Take rcu_read_lock() in case the kioctx is being destroyed, as we - * need to issue a wakeup after incrementing reqs_available. - */ - rcu_read_lock(); - if (iocb->ki_list.next) { unsigned long flags; @@ -947,6 +1028,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* everything turned out well, dispose of the aiocb. */ kiocb_free(iocb); + put_reqs_available(ctx, 1); /* * We have to order our ring_info tail store above and test @@ -959,7 +1041,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - rcu_read_unlock(); + percpu_ref_put(&ctx->reqs); } EXPORT_SYMBOL(aio_complete); @@ -977,6 +1059,7 @@ static long aio_read_events_ring(struct kioctx *ctx, mutex_lock(&ctx->ring_lock); + /* Access to ->ring_pages here is protected by ctx->ring_lock. */ ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; tail = ring->tail; @@ -987,6 +1070,9 @@ static long aio_read_events_ring(struct kioctx *ctx, if (head == tail) goto out; + head %= ctx->nr_events; + tail %= ctx->nr_events; + while (ret < nr) { long avail; struct io_event *ev; @@ -1025,8 +1111,6 @@ static long aio_read_events_ring(struct kioctx *ctx, flush_dcache_page(ctx->ring_pages[0]); pr_debug("%li h%u t%u\n", ret, head, tail); - - put_reqs_available(ctx, ret); out: mutex_unlock(&ctx->ring_lock); @@ -1124,7 +1208,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(current->mm, ioctx); + kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } @@ -1142,9 +1226,25 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(current->mm, ioctx); + struct completion requests_done = + COMPLETION_INITIALIZER_ONSTACK(requests_done); + int ret; + + /* Pass requests_done to kill_ioctx() where it can be set + * in a thread-safe way. If we try to set it here then we have + * a race condition if two io_destroy() called simultaneously. + */ + ret = kill_ioctx(current->mm, ioctx, &requests_done); percpu_ref_put(&ioctx->users); - return 0; + + /* Wait until all IO for the context are done. Otherwise kernel + * keep using user-space buffers even if user thinks the context + * is destroyed. + */ + if (!ret) + wait_for_completion(&requests_done); + + return ret; } pr_debug("EINVAL: io_destroy: invalid context id\n"); return -EINVAL; @@ -1152,6 +1252,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, unsigned long, loff_t); +typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, int rw, char __user *buf, @@ -1209,7 +1310,9 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, int rw; fmode_t mode; aio_rw_op *rw_op; + rw_iter_op *iter_op; struct iovec inline_vec, *iovec = &inline_vec; + struct iov_iter iter; switch (opcode) { case IOCB_CMD_PREAD: @@ -1217,6 +1320,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, mode = FMODE_READ; rw = READ; rw_op = file->f_op->aio_read; + iter_op = file->f_op->read_iter; goto rw_common; case IOCB_CMD_PWRITE: @@ -1224,12 +1328,13 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, mode = FMODE_WRITE; rw = WRITE; rw_op = file->f_op->aio_write; + iter_op = file->f_op->write_iter; goto rw_common; rw_common: if (unlikely(!(file->f_mode & mode))) return -EBADF; - if (!rw_op) + if (!rw_op && !iter_op) return -EINVAL; ret = (opcode == IOCB_CMD_PREADV || @@ -1238,10 +1343,8 @@ rw_common: &iovec, compat) : aio_setup_single_vector(req, rw, buf, &nr_segs, iovec); - if (ret) - return ret; - - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (!ret) + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); if (ret < 0) { if (iovec != &inline_vec) kfree(iovec); @@ -1260,7 +1363,12 @@ rw_common: if (rw == WRITE) file_start_write(file); - ret = rw_op(req, iovec, nr_segs, req->ki_pos); + if (iter_op) { + iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes); + ret = iter_op(req, &iter); + } else { + ret = rw_op(req, iovec, nr_segs, req->ki_pos); + } if (rw == WRITE) file_end_write(file); @@ -1370,6 +1478,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return 0; out_put_req: put_reqs_available(ctx, 1); + percpu_ref_put(&ctx->reqs); kiocb_free(req); return ret; } @@ -1497,7 +1606,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, kiocb = lookup_kiocb(ctx, iocb, key); if (kiocb) - ret = kiocb_cancel(ctx, kiocb); + ret = kiocb_cancel(kiocb); else ret = -EINVAL; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 85c96184995..80ef38c73e5 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -24,7 +24,6 @@ static struct vfsmount *anon_inode_mnt __read_mostly; static struct inode *anon_inode_inode; -static const struct file_operations anon_inode_fops; /* * anon_inodefs_dname() is called from d_path(). @@ -39,67 +38,11 @@ static const struct dentry_operations anon_inodefs_dentry_operations = { .d_dname = anon_inodefs_dname, }; -/* - * nop .set_page_dirty method so that people can use .page_mkwrite on - * anon inodes. - */ -static int anon_set_page_dirty(struct page *page) -{ - return 0; -}; - -static const struct address_space_operations anon_aops = { - .set_page_dirty = anon_set_page_dirty, -}; - -/* - * A single inode exists for all anon_inode files. Contrary to pipes, - * anon_inode inodes have no associated per-instance data, so we need - * only allocate one of them. - */ -static struct inode *anon_inode_mkinode(struct super_block *s) -{ - struct inode *inode = new_inode_pseudo(s); - - if (!inode) - return ERR_PTR(-ENOMEM); - - inode->i_ino = get_next_ino(); - inode->i_fop = &anon_inode_fops; - - inode->i_mapping->a_ops = &anon_aops; - - /* - * Mark the inode dirty from the very beginning, - * that way it will never be moved to the dirty - * list because mark_inode_dirty() will think - * that it already _is_ on the dirty list. - */ - inode->i_state = I_DIRTY; - inode->i_mode = S_IRUSR | S_IWUSR; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_flags |= S_PRIVATE; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - return inode; -} - static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct dentry *root; - root = mount_pseudo(fs_type, "anon_inode:", NULL, + return mount_pseudo(fs_type, "anon_inode:", NULL, &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); - if (!IS_ERR(root)) { - struct super_block *s = root->d_sb; - anon_inode_inode = anon_inode_mkinode(s); - if (IS_ERR(anon_inode_inode)) { - dput(root); - deactivate_locked_super(s); - root = ERR_CAST(anon_inode_inode); - } - } - return root; } static struct file_system_type anon_inode_fs_type = { @@ -109,72 +52,6 @@ static struct file_system_type anon_inode_fs_type = { }; /** - * anon_inode_getfile_private - creates a new file instance by hooking it up to an - * anonymous inode, and a dentry that describe the "class" - * of the file - * - * @name: [in] name of the "class" of the new file - * @fops: [in] file operations for the new file - * @priv: [in] private data for the new file (will be file's private_data) - * @flags: [in] flags - * - * - * Similar to anon_inode_getfile, but each file holds a single inode. - * - */ -struct file *anon_inode_getfile_private(const char *name, - const struct file_operations *fops, - void *priv, int flags) -{ - struct qstr this; - struct path path; - struct file *file; - struct inode *inode; - - if (fops->owner && !try_module_get(fops->owner)) - return ERR_PTR(-ENOENT); - - inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb); - if (IS_ERR(inode)) { - file = ERR_PTR(-ENOMEM); - goto err_module; - } - - /* - * Link the inode to a directory entry by creating a unique name - * using the inode sequence number. - */ - file = ERR_PTR(-ENOMEM); - this.name = name; - this.len = strlen(name); - this.hash = 0; - path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); - if (!path.dentry) - goto err_module; - - path.mnt = mntget(anon_inode_mnt); - - d_instantiate(path.dentry, inode); - - file = alloc_file(&path, OPEN_FMODE(flags), fops); - if (IS_ERR(file)) - goto err_dput; - - file->f_mapping = inode->i_mapping; - file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); - file->private_data = priv; - - return file; - -err_dput: - path_put(&path); -err_module: - module_put(fops->owner); - return file; -} -EXPORT_SYMBOL_GPL(anon_inode_getfile_private); - -/** * anon_inode_getfile - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file @@ -287,22 +164,15 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd); static int __init anon_inode_init(void) { - int error; - - error = register_filesystem(&anon_inode_fs_type); - if (error) - goto err_exit; anon_inode_mnt = kern_mount(&anon_inode_fs_type); - if (IS_ERR(anon_inode_mnt)) { - error = PTR_ERR(anon_inode_mnt); - goto err_unregister_filesystem; - } - return 0; + if (IS_ERR(anon_inode_mnt)) + panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt)); -err_unregister_filesystem: - unregister_filesystem(&anon_inode_fs_type); -err_exit: - panic(KERN_ERR "anon_inode_init() failed (%d)\n", error); + anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); + if (IS_ERR(anon_inode_inode)) + panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode)); + + return 0; } fs_initcall(anon_inode_init); diff --git a/fs/attr.c b/fs/attr.c index 1449adb14ef..6530ced1969 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -50,14 +50,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) if ((ia_valid & ATTR_UID) && (!uid_eq(current_fsuid(), inode->i_uid) || !uid_eq(attr->ia_uid, inode->i_uid)) && - !inode_capable(inode, CAP_CHOWN)) + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && (!uid_eq(current_fsuid(), inode->i_uid) || (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && - !inode_capable(inode, CAP_CHOWN)) + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) return -EPERM; /* Make sure a caller can chmod. */ @@ -67,7 +67,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -160,14 +160,34 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) umode_t mode = attr->ia_mode; if (!in_group_p(inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } } EXPORT_SYMBOL(setattr_copy); -int notify_change(struct dentry * dentry, struct iattr * attr) +/** + * notify_change - modify attributes of a filesytem object + * @dentry: object affected + * @iattr: new attributes + * @delegated_inode: returns inode, if the inode is delegated + * + * The caller must hold the i_mutex on the affected object. + * + * If notify_change discovers a delegation in need of breaking, + * it will return -EWOULDBLOCK and return a reference to the inode in + * delegated_inode. The caller should then break the delegation and + * retry. Because breaking a delegation may take a long time, the + * caller should drop the i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. Also, passing NULL is fine for callers holding + * the file open for write, as there can be no conflicting delegation in + * that case. + */ +int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; @@ -182,11 +202,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr) return -EPERM; } - if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) { - if (attr->ia_size != inode->i_size) - inode_inc_iversion(inode); - } - if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ @@ -243,6 +258,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr) error = security_inode_setattr(dentry, attr); if (error) return error; + error = try_break_deleg(inode, delegated_inode); + if (error) + return error; if (inode->i_op->setattr) error = inode->i_op->setattr(dentry, attr); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 3f1128b37e4..acf32054edd 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -104,7 +104,7 @@ struct autofs_sb_info { u32 magic; int pipefd; struct file *pipe; - pid_t oz_pgrp; + struct pid *oz_pgrp; int catatonic; int version; int sub_version; @@ -122,6 +122,7 @@ struct autofs_sb_info { spinlock_t lookup_lock; struct list_head active_list; struct list_head expiring_list; + struct rcu_head rcu; }; static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) @@ -139,7 +140,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) filesystem without "magic".) */ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp; + return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } /* Does a dentry have some pending activity? */ @@ -271,7 +272,7 @@ void autofs4_clean_ino(struct autofs_info *); static inline int autofs_prepare_pipe(struct file *pipe) { - if (!pipe->f_op || !pipe->f_op->write) + if (!pipe->f_op->write) return -EINVAL; if (!S_ISFIFO(file_inode(pipe)->i_mode)) return -EINVAL; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 0f00da329e7..5b570b6efa2 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -103,6 +103,9 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i if (tmp.size < sizeof(tmp)) return ERR_PTR(-EINVAL); + if (tmp.size > (PATH_MAX + sizeof(tmp))) + return ERR_PTR(-ENAMETOOLONG); + return memdup_user(in, tmp.size); } @@ -346,6 +349,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, { int pipefd; int err = 0; + struct pid *new_pid = NULL; if (param->setpipefd.pipefd == -1) return -EINVAL; @@ -357,7 +361,17 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, mutex_unlock(&sbi->wq_mutex); return -EBUSY; } else { - struct file *pipe = fget(pipefd); + struct file *pipe; + + new_pid = get_task_pid(current, PIDTYPE_PGID); + + if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { + AUTOFS_WARN("Not allowed to change PID namespace"); + err = -EINVAL; + goto out; + } + + pipe = fget(pipefd); if (!pipe) { err = -EBADF; goto out; @@ -367,12 +381,13 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, fput(pipe); goto out; } - sbi->oz_pgrp = task_pgrp_nr(current); + swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; } out: + put_pid(new_pid); mutex_unlock(&sbi->wq_mutex); return err; } @@ -658,12 +673,6 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __use goto out; } - if (!fp->f_op) { - err = -ENOTTY; - fput(fp); - goto out; - } - sbi = autofs_dev_ioctl_sbi(fp); if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) { err = -EINVAL; @@ -728,7 +737,7 @@ MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); MODULE_ALIAS("devname:autofs"); /* Register/deregister misc character device */ -int autofs_dev_ioctl_init(void) +int __init autofs_dev_ioctl_init(void) { int r; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 3d9d3f5d5dd..394e90b02c5 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -402,6 +402,20 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, goto next; } + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { + DPRINTK("checking symlink %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + /* + * A symlink can't be "busy" in the usual sense so + * just check last used for expire timeout. + */ + if (autofs4_can_expire(dentry, timeout, do_now)) { + expired = dentry; + goto found; + } + goto next; + } + if (simple_empty(dentry)) goto next; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index b104726e2d0..1c55388ae63 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -56,18 +56,16 @@ void autofs4_kill_sb(struct super_block *sb) * just call kill_anon_super when we are called from * deactivate_super. */ - if (!sbi) - goto out_kill_sb; - - /* Free wait queues, close pipe */ - autofs4_catatonic_mode(sbi); - - sb->s_fs_info = NULL; - kfree(sbi); + if (sbi) { + /* Free wait queues, close pipe */ + autofs4_catatonic_mode(sbi); + put_pid(sbi->oz_pgrp); + } -out_kill_sb: DPRINTK("shutting down"); kill_litter_super(sb); + if (sbi) + kfree_rcu(sbi, rcu); } static int autofs4_show_options(struct seq_file *m, struct dentry *root) @@ -85,7 +83,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, root_inode->i_gid)); - seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); + seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); @@ -129,7 +127,8 @@ static const match_table_t tokens = { }; static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, - pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) + int *pgrp, bool *pgrp_set, unsigned int *type, + int *minproto, int *maxproto) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -137,7 +136,6 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, *uid = current_uid(); *gid = current_gid(); - *pgrp = task_pgrp_nr(current); *minproto = AUTOFS_MIN_PROTO_VERSION; *maxproto = AUTOFS_MAX_PROTO_VERSION; @@ -176,6 +174,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, if (match_int(args, &option)) return 1; *pgrp = option; + *pgrp_set = true; break; case Opt_minproto: if (match_int(args, &option)) @@ -211,10 +210,13 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; + int pgrp = 0; + bool pgrp_set = false; + int ret = -EINVAL; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - goto fail_unlock; + return -ENOMEM; DPRINTK("starting up, sbi = %p",sbi); s->s_fs_info = sbi; @@ -223,7 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->pipe = NULL; sbi->catatonic = 1; sbi->exp_timeout = 0; - sbi->oz_pgrp = task_pgrp_nr(current); + sbi->oz_pgrp = NULL; sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; @@ -248,8 +250,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) * Get the root inode and dentry, but defer checking for errors. */ ino = autofs4_new_ino(sbi); - if (!ino) + if (!ino) { + ret = -ENOMEM; goto fail_free; + } root_inode = autofs4_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); if (!root) @@ -260,12 +264,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) /* Can this call block? */ if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, - &sbi->oz_pgrp, &sbi->type, &sbi->min_proto, - &sbi->max_proto)) { + &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, + &sbi->max_proto)) { printk("autofs: called with bogus options\n"); goto fail_dput; } + if (pgrp_set) { + sbi->oz_pgrp = find_get_pid(pgrp); + if (!sbi->oz_pgrp) { + pr_warn("autofs: could not find process group %d\n", + pgrp); + goto fail_dput; + } + } else { + sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); + } + if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); @@ -289,14 +304,15 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); + DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); - + if (!pipe) { printk("autofs: could not open pipe file descriptor\n"); goto fail_dput; } - if (autofs_prepare_pipe(pipe) < 0) + ret = autofs_prepare_pipe(pipe); + if (ret < 0) goto fail_fput; sbi->pipe = pipe; sbi->pipefd = pipefd; @@ -321,10 +337,10 @@ fail_dput: fail_ino: kfree(ino); fail_free: + put_pid(sbi->oz_pgrp); kfree(sbi); s->s_fs_info = NULL; -fail_unlock: - return -EINVAL; + return ret; } struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 92ef341ba0c..cc87c1abac9 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? */ - if (!d_count(active)) + if ((int) d_count(active) <= 0) goto next; qstr = &active->d_name; @@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) spin_lock(&expiring->d_lock); - /* Bad luck, we've already been dentry_iput */ + /* We've already been dentry_iput or unlinked */ if (!expiring->d_inode) goto next; @@ -558,7 +558,7 @@ static int autofs4_dir_symlink(struct inode *dir, dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); dir->i_mtime = CURRENT_TIME; @@ -593,7 +593,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_dec(&p_ino->count); } dput(ino->dentry); @@ -732,7 +732,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); inc_nlink(dir); dir->i_mtime = CURRENT_TIME; diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index f27c094a191..1e8ea192be2 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -14,6 +14,10 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) { + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino && !autofs4_oz_mode(sbi)) + ino->last_used = jiffies; nd_set_link(nd, dentry->d_inode->i_private); return NULL; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 689e40d983a..116fd38ee47 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -347,11 +347,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, struct qstr qstr; char *name; int status, ret, type; + pid_t pid; + pid_t tgid; /* In catatonic mode, we don't wait for nobody */ if (sbi->catatonic) return -ENOENT; + /* + * Try translating pids to the namespace of the daemon. + * + * Zero means failure: we are in an unrelated pid namespace. + */ + pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + if (pid == 0 || tgid == 0) + return -ENOENT; + if (!dentry->d_inode) { /* * A wait for a negative dentry is invalid for certain @@ -417,8 +429,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->ino = autofs4_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); - wq->pid = current->pid; - wq->tgid = current->tgid; + wq->pid = pid; + wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ wq->wait_ctr = 2; diff --git a/fs/befs/Makefile b/fs/befs/Makefile index 2f370bd7a50..8b9f66642a8 100644 --- a/fs/befs/Makefile +++ b/fs/befs/Makefile @@ -3,5 +3,5 @@ # obj-$(CONFIG_BEFS_FS) += befs.o - +ccflags-$(CONFIG_BEFS_DEBUG) += -DDEBUG befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o diff --git a/fs/befs/befs.h b/fs/befs/befs.h index b2664283915..3a7813ab8c9 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -88,8 +88,11 @@ enum befs_err { /****************************/ /* debug.c */ +__printf(2, 3) void befs_error(const struct super_block *sb, const char *fmt, ...); +__printf(2, 3) void befs_warning(const struct super_block *sb, const char *fmt, ...); +__printf(2, 3) void befs_debug(const struct super_block *sb, const char *fmt, ...); void befs_dump_super_block(const struct super_block *sb, befs_super_block *); diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 74e397db0b8..9c7faa8a928 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -137,7 +137,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, struct buffer_head *bh = NULL; befs_disk_btree_super *od_sup = NULL; - befs_debug(sb, "---> befs_btree_read_super()"); + befs_debug(sb, "---> %s", __func__); bh = befs_read_datastream(sb, ds, 0, NULL); @@ -162,11 +162,11 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, goto error; } - befs_debug(sb, "<--- befs_btree_read_super()"); + befs_debug(sb, "<--- %s", __func__); return BEFS_OK; error: - befs_debug(sb, "<--- befs_btree_read_super() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -195,16 +195,16 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, { uint off = 0; - befs_debug(sb, "---> befs_bt_read_node()"); + befs_debug(sb, "---> %s", __func__); if (node->bh) brelse(node->bh); node->bh = befs_read_datastream(sb, ds, node_off, &off); if (!node->bh) { - befs_error(sb, "befs_bt_read_node() failed to read " - "node at %Lu", node_off); - befs_debug(sb, "<--- befs_bt_read_node() ERROR"); + befs_error(sb, "%s failed to read " + "node at %llu", __func__, node_off); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -221,7 +221,7 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, node->head.all_key_length = fs16_to_cpu(sb, node->od_node->all_key_length); - befs_debug(sb, "<--- befs_btree_read_node()"); + befs_debug(sb, "<--- %s", __func__); return BEFS_OK; } @@ -252,7 +252,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, befs_off_t node_off; int res; - befs_debug(sb, "---> befs_btree_find() Key: %s", key); + befs_debug(sb, "---> %s Key: %s", __func__, key); if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { befs_error(sb, @@ -263,7 +263,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS); if (!this_node) { - befs_error(sb, "befs_btree_find() failed to allocate %u " + befs_error(sb, "befs_btree_find() failed to allocate %zu " "bytes of memory", sizeof (befs_btree_node)); goto error; } @@ -274,7 +274,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, node_off = bt_super.root_node_ptr; if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { befs_error(sb, "befs_btree_find() failed to read " - "node at %Lu", node_off); + "node at %llu", node_off); goto error_alloc; } @@ -285,7 +285,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, /* if no match, go to overflow node */ if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { befs_error(sb, "befs_btree_find() failed to read " - "node at %Lu", node_off); + "node at %llu", node_off); goto error_alloc; } } @@ -298,11 +298,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, kfree(this_node); if (res != BEFS_BT_MATCH) { - befs_debug(sb, "<--- befs_btree_find() Key %s not found", key); + befs_debug(sb, "<--- %s Key %s not found", __func__, key); *value = 0; return BEFS_BT_NOT_FOUND; } - befs_debug(sb, "<--- befs_btree_find() Found key %s, value %Lu", + befs_debug(sb, "<--- %s Found key %s, value %llu", __func__, key, *value); return BEFS_OK; @@ -310,7 +310,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, kfree(this_node); error: *value = 0; - befs_debug(sb, "<--- befs_btree_find() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -318,7 +318,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, * befs_find_key - Search for a key within a node * @sb: Filesystem superblock * @node: Node to find the key within - * @key: Keystring to search for + * @findkey: Keystring to search for * @value: If key is found, the value stored with the key is put here * * finds exact match if one exists, and returns BEFS_BT_MATCH @@ -343,7 +343,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, char *thiskey; fs64 *valarray; - befs_debug(sb, "---> befs_find_key() %s", findkey); + befs_debug(sb, "---> %s %s", __func__, findkey); *value = 0; @@ -355,7 +355,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len); if (eq < 0) { - befs_debug(sb, "<--- befs_find_key() %s not found", findkey); + befs_debug(sb, "<--- %s %s not found", __func__, findkey); return BEFS_BT_NOT_FOUND; } @@ -373,8 +373,8 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, findkey_len); if (eq == 0) { - befs_debug(sb, "<--- befs_find_key() found %s at %d", - thiskey, mid); + befs_debug(sb, "<--- %s found %s at %d", + __func__, thiskey, mid); *value = fs64_to_cpu(sb, valarray[mid]); return BEFS_BT_MATCH; @@ -388,7 +388,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, *value = fs64_to_cpu(sb, valarray[mid + 1]); else *value = fs64_to_cpu(sb, valarray[mid]); - befs_debug(sb, "<--- befs_find_key() found %s at %d", thiskey, mid); + befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid); return BEFS_BT_PARMATCH; } @@ -405,7 +405,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, * Heres how it works: Key_no is the index of the key/value pair to * return in keybuf/value. * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is - * the number of charecters in the key (just a convenience). + * the number of characters in the key (just a convenience). * * Algorithm: * Get the first leafnode of the tree. See if the requested key is in that @@ -428,7 +428,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, uint key_sum = 0; - befs_debug(sb, "---> befs_btree_read()"); + befs_debug(sb, "---> %s", __func__); if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { befs_error(sb, @@ -437,7 +437,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, } if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { - befs_error(sb, "befs_btree_read() failed to allocate %u " + befs_error(sb, "befs_btree_read() failed to allocate %zu " "bytes of memory", sizeof (befs_btree_node)); goto error; } @@ -452,7 +452,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, kfree(this_node); *value = 0; *keysize = 0; - befs_debug(sb, "<--- befs_btree_read() Tree is EMPTY"); + befs_debug(sb, "<--- %s Tree is EMPTY", __func__); return BEFS_BT_EMPTY; } else if (res == BEFS_ERR) { goto error_alloc; @@ -467,7 +467,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, *keysize = 0; *value = 0; befs_debug(sb, - "<--- befs_btree_read() END of keys at %Lu", + "<--- %s END of keys at %llu", __func__, + (unsigned long long) key_sum + this_node->head.all_key_count); brelse(this_node->bh); kfree(this_node); @@ -478,8 +479,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, node_off = this_node->head.right; if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { - befs_error(sb, "befs_btree_read() failed to read " - "node at %Lu", node_off); + befs_error(sb, "%s failed to read node at %llu", + __func__, (unsigned long long)node_off); goto error_alloc; } } @@ -492,27 +493,28 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen); - befs_debug(sb, "Read [%Lu,%d]: keysize %d", node_off, cur_key, keylen); + befs_debug(sb, "Read [%llu,%d]: keysize %d", + (long long unsigned int)node_off, (int)cur_key, + (int)keylen); if (bufsize < keylen + 1) { - befs_error(sb, "befs_btree_read() keybuf too small (%u) " - "for key of size %d", bufsize, keylen); + befs_error(sb, "%s keybuf too small (%zu) " + "for key of size %d", __func__, bufsize, keylen); brelse(this_node->bh); goto error_alloc; - }; + } - strncpy(keybuf, keystart, keylen); + strlcpy(keybuf, keystart, keylen + 1); *value = fs64_to_cpu(sb, valarray[cur_key]); *keysize = keylen; - keybuf[keylen] = '\0'; - befs_debug(sb, "Read [%Lu,%d]: Key \"%.*s\", Value %Lu", node_off, + befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off, cur_key, keylen, keybuf, *value); brelse(this_node->bh); kfree(this_node); - befs_debug(sb, "<--- befs_btree_read()"); + befs_debug(sb, "<--- %s", __func__); return BEFS_OK; @@ -522,7 +524,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, error: *keysize = 0; *value = 0; - befs_debug(sb, "<--- befs_btree_read() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -547,26 +549,26 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, befs_off_t * node_off) { - befs_debug(sb, "---> befs_btree_seekleaf()"); + befs_debug(sb, "---> %s", __func__); if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { - befs_error(sb, "befs_btree_seekleaf() failed to read " - "node at %Lu", *node_off); + befs_error(sb, "%s failed to read " + "node at %llu", __func__, *node_off); goto error; } - befs_debug(sb, "Seekleaf to root node %Lu", *node_off); + befs_debug(sb, "Seekleaf to root node %llu", *node_off); if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) { - befs_debug(sb, "<--- befs_btree_seekleaf() Tree is EMPTY"); + befs_debug(sb, "<--- %s Tree is EMPTY", __func__); return BEFS_BT_EMPTY; } while (!befs_leafnode(this_node)) { if (this_node->head.all_key_count == 0) { - befs_debug(sb, "befs_btree_seekleaf() encountered " - "an empty interior node: %Lu. Using Overflow " - "node: %Lu", *node_off, + befs_debug(sb, "%s encountered " + "an empty interior node: %llu. Using Overflow " + "node: %llu", __func__, *node_off, this_node->head.overflow); *node_off = this_node->head.overflow; } else { @@ -574,19 +576,19 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, *node_off = fs64_to_cpu(sb, valarray[0]); } if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { - befs_error(sb, "befs_btree_seekleaf() failed to read " - "node at %Lu", *node_off); + befs_error(sb, "%s failed to read " + "node at %llu", __func__, *node_off); goto error; } - befs_debug(sb, "Seekleaf to child node %Lu", *node_off); + befs_debug(sb, "Seekleaf to child node %llu", *node_off); } - befs_debug(sb, "Node %Lu is a leaf node", *node_off); + befs_debug(sb, "Node %llu is a leaf node", *node_off); return BEFS_OK; error: - befs_debug(sb, "<--- befs_btree_seekleaf() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -704,7 +706,7 @@ befs_bt_get_key(struct super_block *sb, befs_btree_node * node, * @key1: pointer to the first key to be compared * @keylen1: length in bytes of key1 * @key2: pointer to the second key to be compared - * @kelen2: length in bytes of key2 + * @keylen2: length in bytes of key2 * * Returns 0 if @key1 and @key2 are equal. * Returns >0 if @key1 is greater. diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index 59096b5e0fc..1e8e0b8d883 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -52,26 +52,25 @@ befs_read_datastream(struct super_block *sb, befs_data_stream * ds, befs_block_run run; befs_blocknr_t block; /* block coresponding to pos */ - befs_debug(sb, "---> befs_read_datastream() %Lu", pos); + befs_debug(sb, "---> %s %llu", __func__, pos); block = pos >> BEFS_SB(sb)->block_shift; if (off) *off = pos - (block << BEFS_SB(sb)->block_shift); if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) { befs_error(sb, "BeFS: Error finding disk addr of block %lu", - block); - befs_debug(sb, "<--- befs_read_datastream() ERROR"); + (unsigned long)block); + befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } bh = befs_bread_iaddr(sb, run); if (!bh) { befs_error(sb, "BeFS: Error reading block %lu from datastream", - block); + (unsigned long)block); return NULL; } - befs_debug(sb, "<--- befs_read_datastream() read data, starting at %Lu", - pos); + befs_debug(sb, "<--- %s read data, starting at %llu", __func__, pos); return bh; } @@ -106,7 +105,8 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data, } else { befs_error(sb, "befs_fblock2brun() was asked to find block %lu, " - "which is not mapped by the datastream\n", fblock); + "which is not mapped by the datastream\n", + (unsigned long)fblock); err = BEFS_ERR; } return err; @@ -116,7 +116,7 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data, * befs_read_lsmylink - read long symlink from datastream. * @sb: Filesystem superblock * @ds: Datastrem to read from - * @buf: Buffer in which to place long symlink data + * @buff: Buffer in which to place long symlink data * @len: Length of the long symlink in bytes * * Returns the number of bytes read @@ -128,14 +128,14 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff, befs_off_t bytes_read = 0; /* bytes readed */ u16 plen; struct buffer_head *bh = NULL; - befs_debug(sb, "---> befs_read_lsymlink() length: %Lu", len); + befs_debug(sb, "---> %s length: %llu", __func__, len); while (bytes_read < len) { bh = befs_read_datastream(sb, ds, bytes_read, NULL); if (!bh) { befs_error(sb, "BeFS: Error reading datastream block " - "starting from %Lu", bytes_read); - befs_debug(sb, "<--- befs_read_lsymlink() ERROR"); + "starting from %llu", bytes_read); + befs_debug(sb, "<--- %s ERROR", __func__); return bytes_read; } @@ -146,7 +146,8 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff, bytes_read += plen; } - befs_debug(sb, "<--- befs_read_lsymlink() read %u bytes", bytes_read); + befs_debug(sb, "<--- %s read %u bytes", __func__, (unsigned int) + bytes_read); return bytes_read; } @@ -169,7 +170,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds) befs_blocknr_t metablocks; /* FS metadata blocks */ befs_sb_info *befs_sb = BEFS_SB(sb); - befs_debug(sb, "---> befs_count_blocks()"); + befs_debug(sb, "---> %s", __func__); datablocks = ds->size >> befs_sb->block_shift; if (ds->size & (befs_sb->block_size - 1)) @@ -206,7 +207,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds) } blocks = datablocks + metablocks; - befs_debug(sb, "<--- befs_count_blocks() %u blocks", blocks); + befs_debug(sb, "<--- %s %u blocks", __func__, (unsigned int)blocks); return blocks; } @@ -251,11 +252,11 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data, befs_blocknr_t max_block = data->max_direct_range >> BEFS_SB(sb)->block_shift; - befs_debug(sb, "---> befs_find_brun_direct(), find %lu", blockno); + befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno); if (blockno > max_block) { - befs_error(sb, "befs_find_brun_direct() passed block outside of" - "direct region"); + befs_error(sb, "%s passed block outside of direct region", + __func__); return BEFS_ERR; } @@ -267,13 +268,14 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data, run->start = array[i].start + offset; run->len = array[i].len - offset; - befs_debug(sb, "---> befs_find_brun_direct(), " - "found %lu at direct[%d]", blockno, i); + befs_debug(sb, "---> %s, " + "found %lu at direct[%d]", __func__, + (unsigned long)blockno, i); return BEFS_OK; } } - befs_debug(sb, "---> befs_find_brun_direct() ERROR"); + befs_debug(sb, "---> %s ERROR", __func__); return BEFS_ERR; } @@ -316,7 +318,7 @@ befs_find_brun_indirect(struct super_block *sb, befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect); int arraylen = befs_iaddrs_per_block(sb); - befs_debug(sb, "---> befs_find_brun_indirect(), find %lu", blockno); + befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno); indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift; search_blk = blockno - indir_start_blk; @@ -325,10 +327,9 @@ befs_find_brun_indirect(struct super_block *sb, for (i = 0; i < indirect.len; i++) { indirblock = befs_bread(sb, indirblockno + i); if (indirblock == NULL) { - befs_debug(sb, - "---> befs_find_brun_indirect() failed to " - "read disk block %lu from the indirect brun", - indirblockno + i); + befs_debug(sb, "---> %s failed to read " + "disk block %lu from the indirect brun", + __func__, (unsigned long)indirblockno + i); return BEFS_ERR; } @@ -348,9 +349,10 @@ befs_find_brun_indirect(struct super_block *sb, brelse(indirblock); befs_debug(sb, - "<--- befs_find_brun_indirect() found " - "file block %lu at indirect[%d]", - blockno, j + (i * arraylen)); + "<--- %s found file block " + "%lu at indirect[%d]", __func__, + (unsigned long)blockno, + j + (i * arraylen)); return BEFS_OK; } sum += len; @@ -360,10 +362,10 @@ befs_find_brun_indirect(struct super_block *sb, } /* Only fallthrough is an error */ - befs_error(sb, "BeFS: befs_find_brun_indirect() failed to find " - "file block %lu", blockno); + befs_error(sb, "BeFS: %s failed to find " + "file block %lu", __func__, (unsigned long)blockno); - befs_debug(sb, "<--- befs_find_brun_indirect() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -444,7 +446,7 @@ befs_find_brun_dblindirect(struct super_block *sb, size_t diblklen = iblklen * befs_iaddrs_per_block(sb) * BEFS_DBLINDIR_BRUN_LEN; - befs_debug(sb, "---> befs_find_brun_dblindirect() find %lu", blockno); + befs_debug(sb, "---> %s find %lu", __func__, (unsigned long)blockno); /* First, discover which of the double_indir->indir blocks * contains pos. Then figure out how much of pos that @@ -460,8 +462,9 @@ befs_find_brun_dblindirect(struct super_block *sb, dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb); if (dbl_which_block > data->double_indirect.len) { befs_error(sb, "The double-indirect index calculated by " - "befs_read_brun_dblindirect(), %d, is outside the range " - "of the double-indirect block", dblindir_indx); + "%s, %d, is outside the range " + "of the double-indirect block", __func__, + dblindir_indx); return BEFS_ERR; } @@ -469,10 +472,10 @@ befs_find_brun_dblindirect(struct super_block *sb, befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); if (dbl_indir_block == NULL) { - befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " - "double-indirect block at blockno %lu", - iaddr2blockno(sb, - &data->double_indirect) + + befs_error(sb, "%s couldn't read the " + "double-indirect block at blockno %lu", __func__, + (unsigned long) + iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); brelse(dbl_indir_block); return BEFS_ERR; @@ -489,16 +492,16 @@ befs_find_brun_dblindirect(struct super_block *sb, which_block = indir_indx / befs_iaddrs_per_block(sb); if (which_block > indir_run.len) { befs_error(sb, "The indirect index calculated by " - "befs_read_brun_dblindirect(), %d, is outside the range " - "of the indirect block", indir_indx); + "%s, %d, is outside the range " + "of the indirect block", __func__, indir_indx); return BEFS_ERR; } indir_block = befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); if (indir_block == NULL) { - befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " - "indirect block at blockno %lu", + befs_error(sb, "%s couldn't read the indirect block " + "at blockno %lu", __func__, (unsigned long) iaddr2blockno(sb, &indir_run) + which_block); brelse(indir_block); return BEFS_ERR; @@ -519,7 +522,7 @@ befs_find_brun_dblindirect(struct super_block *sb, run->len -= offset; befs_debug(sb, "Found file block %lu in double_indirect[%d][%d]," - " double_indirect_leftover = %lu", + " double_indirect_leftover = %lu", (unsigned long) blockno, dblindir_indx, indir_indx, dblindir_leftover); return BEFS_OK; diff --git a/fs/befs/debug.c b/fs/befs/debug.c index 622e73775c8..4de7cffcd66 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -10,6 +10,7 @@ * debug functions */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #ifdef __KERNEL__ #include <stdarg.h> @@ -23,43 +24,30 @@ #include "befs.h" -#define ERRBUFSIZE 1024 - void befs_error(const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; - char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); - if (err_buf == NULL) { - printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); - return; - } va_start(args, fmt); - vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("(%s): %pV\n", sb->s_id, &vaf); va_end(args); - - printk(KERN_ERR "BeFS(%s): %s\n", sb->s_id, err_buf); - kfree(err_buf); } void befs_warning(const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; - char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); - if (err_buf == NULL) { - printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); - return; - } va_start(args, fmt); - vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("(%s): %pV\n", sb->s_id, &vaf); va_end(args); - - printk(KERN_WARNING "BeFS(%s): %s\n", sb->s_id, err_buf); - - kfree(err_buf); } void @@ -67,25 +55,13 @@ befs_debug(const struct super_block *sb, const char *fmt, ...) { #ifdef CONFIG_BEFS_DEBUG + struct va_format vaf; va_list args; - char *err_buf = NULL; - - if (BEFS_SB(sb)->mount_opts.debug) { - err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); - if (err_buf == NULL) { - printk(KERN_ERR "could not allocate %d bytes\n", - ERRBUFSIZE); - return; - } - - va_start(args, fmt); - vsnprintf(err_buf, ERRBUFSIZE, fmt, args); - va_end(args); - - printk(KERN_DEBUG "BeFS(%s): %s\n", sb->s_id, err_buf); - - kfree(err_buf); - } + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_debug("(%s): %pV\n", sb->s_id, &vaf); + va_end(args); #endif //CONFIG_BEFS_DEBUG } @@ -109,9 +85,9 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid)); befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode)); befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags)); - befs_debug(sb, " create_time %Lu", + befs_debug(sb, " create_time %llu", fs64_to_cpu(sb, inode->create_time)); - befs_debug(sb, " last_modified_time %Lu", + befs_debug(sb, " last_modified_time %llu", fs64_to_cpu(sb, inode->last_modified_time)); tmp_run = fsrun_to_cpu(sb, inode->parent); @@ -137,7 +113,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) tmp_run.allocation_group, tmp_run.start, tmp_run.len); } - befs_debug(sb, " max_direct_range %Lu", + befs_debug(sb, " max_direct_range %llu", fs64_to_cpu(sb, inode->data.datastream. max_direct_range)); @@ -147,7 +123,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) tmp_run.allocation_group, tmp_run.start, tmp_run.len); - befs_debug(sb, " max_indirect_range %Lu", + befs_debug(sb, " max_indirect_range %llu", fs64_to_cpu(sb, inode->data.datastream. max_indirect_range)); @@ -158,12 +134,12 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) tmp_run.allocation_group, tmp_run.start, tmp_run.len); - befs_debug(sb, " max_double_indirect_range %Lu", + befs_debug(sb, " max_double_indirect_range %llu", fs64_to_cpu(sb, inode->data.datastream. max_double_indirect_range)); - befs_debug(sb, " size %Lu", + befs_debug(sb, " size %llu", fs64_to_cpu(sb, inode->data.datastream.size)); } @@ -191,8 +167,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size)); befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift)); - befs_debug(sb, " num_blocks %Lu", fs64_to_cpu(sb, sup->num_blocks)); - befs_debug(sb, " used_blocks %Lu", fs64_to_cpu(sb, sup->used_blocks)); + befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks)); + befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks)); befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); befs_debug(sb, " blocks_per_ag %u", @@ -206,8 +182,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) befs_debug(sb, " log_blocks %u, %hu, %hu", tmp_run.allocation_group, tmp_run.start, tmp_run.len); - befs_debug(sb, " log_start %Ld", fs64_to_cpu(sb, sup->log_start)); - befs_debug(sb, " log_end %Ld", fs64_to_cpu(sb, sup->log_end)); + befs_debug(sb, " log_start %lld", fs64_to_cpu(sb, sup->log_start)); + befs_debug(sb, " log_end %lld", fs64_to_cpu(sb, sup->log_end)); befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3)); diff --git a/fs/befs/inode.c b/fs/befs/inode.c index 94c17f9a957..fa4b718de59 100644 --- a/fs/befs/inode.c +++ b/fs/befs/inode.c @@ -25,7 +25,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode, /* check magic header. */ if (magic1 != BEFS_INODE_MAGIC1) { befs_error(sb, - "Inode has a bad magic header - inode = %lu", inode); + "Inode has a bad magic header - inode = %lu", + (unsigned long)inode); return BEFS_BAD_INODE; } @@ -34,8 +35,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode, */ if (inode != iaddr2blockno(sb, &ino_num)) { befs_error(sb, "inode blocknr field disagrees with vfs " - "VFS: %lu, Inode %lu", - inode, iaddr2blockno(sb, &ino_num)); + "VFS: %lu, Inode %lu", (unsigned long) + inode, (unsigned long)iaddr2blockno(sb, &ino_num)); return BEFS_BAD_INODE; } @@ -44,7 +45,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode, */ if (!(flags & BEFS_INODE_IN_USE)) { - befs_error(sb, "inode is not used - inode = %lu", inode); + befs_error(sb, "inode is not used - inode = %lu", + (unsigned long)inode); return BEFS_BAD_INODE; } diff --git a/fs/befs/io.c b/fs/befs/io.c index ddef98aa255..0408a3d601d 100644 --- a/fs/befs/io.c +++ b/fs/befs/io.c @@ -30,9 +30,9 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) befs_blocknr_t block = 0; befs_sb_info *befs_sb = BEFS_SB(sb); - befs_debug(sb, "---> Enter befs_read_iaddr() " - "[%u, %hu, %hu]", - iaddr.allocation_group, iaddr.start, iaddr.len); + befs_debug(sb, "---> Enter %s " + "[%u, %hu, %hu]", __func__, iaddr.allocation_group, + iaddr.start, iaddr.len); if (iaddr.allocation_group > befs_sb->num_ags) { befs_error(sb, "BEFS: Invalid allocation group %u, max is %u", @@ -42,20 +42,21 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) block = iaddr2blockno(sb, &iaddr); - befs_debug(sb, "befs_read_iaddr: offset = %lu", block); + befs_debug(sb, "%s: offset = %lu", __func__, (unsigned long)block); bh = sb_bread(sb, block); if (bh == NULL) { - befs_error(sb, "Failed to read block %lu", block); + befs_error(sb, "Failed to read block %lu", + (unsigned long)block); goto error; } - befs_debug(sb, "<--- befs_read_iaddr()"); + befs_debug(sb, "<--- %s", __func__); return bh; error: - befs_debug(sb, "<--- befs_read_iaddr() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } @@ -64,20 +65,21 @@ befs_bread(struct super_block *sb, befs_blocknr_t block) { struct buffer_head *bh = NULL; - befs_debug(sb, "---> Enter befs_read() %Lu", block); + befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block); bh = sb_bread(sb, block); if (bh == NULL) { - befs_error(sb, "Failed to read block %lu", block); + befs_error(sb, "Failed to read block %lu", + (unsigned long)block); goto error; } - befs_debug(sb, "<--- befs_read()"); + befs_debug(sb, "<--- %s", __func__); return bh; error: - befs_debug(sb, "<--- befs_read() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index daa15d6ba45..a16fbd4e824 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -5,6 +5,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/slab.h> #include <linux/fs.h> @@ -39,7 +41,6 @@ static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int) static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); -static int befs_init_inodecache(void); static void befs_destroy_inodecache(void); static void *befs_follow_link(struct dentry *, struct nameidata *); static void *befs_fast_follow_link(struct dentry *, struct nameidata *); @@ -131,26 +132,20 @@ befs_get_block(struct inode *inode, sector_t block, ulong disk_off; befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", - inode->i_ino, block); - - if (block < 0) { - befs_error(sb, "befs_get_block() was asked for a block " - "number less than zero: block %ld in inode %lu", - block, inode->i_ino); - return -EIO; - } - + (unsigned long)inode->i_ino, (long)block); if (create) { befs_error(sb, "befs_get_block() was asked to write to " - "block %ld in inode %lu", block, inode->i_ino); + "block %ld in inode %lu", (long)block, + (unsigned long)inode->i_ino); return -EPERM; } res = befs_fblock2brun(sb, ds, block, &run); if (res != BEFS_OK) { befs_error(sb, - "<--- befs_get_block() for inode %lu, block " - "%ld ERROR", inode->i_ino, block); + "<--- %s for inode %lu, block %ld ERROR", + __func__, (unsigned long)inode->i_ino, + (long)block); return -EFBIG; } @@ -158,8 +153,9 @@ befs_get_block(struct inode *inode, sector_t block, map_bh(bh_result, inode->i_sb, disk_off); - befs_debug(sb, "<--- befs_get_block() for inode %lu, block %ld, " - "disk address %lu", inode->i_ino, block, disk_off); + befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu", + __func__, (unsigned long)inode->i_ino, (long)block, + (unsigned long)disk_off); return 0; } @@ -176,15 +172,15 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) char *utfname; const char *name = dentry->d_name.name; - befs_debug(sb, "---> befs_lookup() " - "name %s inode %ld", dentry->d_name.name, dir->i_ino); + befs_debug(sb, "---> %s name %s inode %ld", __func__, + dentry->d_name.name, dir->i_ino); /* Convert to UTF-8 */ if (BEFS_SB(sb)->nls) { ret = befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen); if (ret < 0) { - befs_debug(sb, "<--- befs_lookup() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return ERR_PTR(ret); } ret = befs_btree_find(sb, ds, utfname, &offset); @@ -195,12 +191,12 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } if (ret == BEFS_BT_NOT_FOUND) { - befs_debug(sb, "<--- befs_lookup() %s not found", + befs_debug(sb, "<--- %s %s not found", __func__, dentry->d_name.name); return ERR_PTR(-ENOENT); } else if (ret != BEFS_OK || offset == 0) { - befs_warning(sb, "<--- befs_lookup() Error"); + befs_warning(sb, "<--- %s Error", __func__); return ERR_PTR(-ENODATA); } @@ -210,7 +206,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) d_add(dentry, inode); - befs_debug(sb, "<--- befs_lookup()"); + befs_debug(sb, "<--- %s", __func__); return NULL; } @@ -228,26 +224,25 @@ befs_readdir(struct file *file, struct dir_context *ctx) char keybuf[BEFS_NAME_LEN + 1]; const char *dirname = file->f_path.dentry->d_name.name; - befs_debug(sb, "---> befs_readdir() " - "name %s, inode %ld, ctx->pos %Ld", - dirname, inode->i_ino, ctx->pos); + befs_debug(sb, "---> %s name %s, inode %ld, ctx->pos %lld", + __func__, dirname, inode->i_ino, ctx->pos); more: result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, keybuf, &keysize, &value); if (result == BEFS_ERR) { - befs_debug(sb, "<--- befs_readdir() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); befs_error(sb, "IO error reading %s (inode %lu)", dirname, inode->i_ino); return -EIO; } else if (result == BEFS_BT_END) { - befs_debug(sb, "<--- befs_readdir() END"); + befs_debug(sb, "<--- %s END", __func__); return 0; } else if (result == BEFS_BT_EMPTY) { - befs_debug(sb, "<--- befs_readdir() Empty directory"); + befs_debug(sb, "<--- %s Empty directory", __func__); return 0; } @@ -260,7 +255,7 @@ more: result = befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); if (result < 0) { - befs_debug(sb, "<--- befs_readdir() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return result; } if (!dir_emit(ctx, nlsname, nlsnamelen, @@ -277,7 +272,7 @@ more: ctx->pos++; goto more; - befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos); + befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos); return 0; } @@ -321,11 +316,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) struct inode *inode; long ret = -EIO; - befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); + befs_debug(sb, "---> %s inode = %lu", __func__, ino); inode = iget_locked(sb, ino); - if (IS_ERR(inode)) - return inode; + if (!inode) + return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; @@ -393,9 +388,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) if (S_ISLNK(inode->i_mode) && !(befs_ino->i_flags & BEFS_LONG_SYMLINK)){ inode->i_size = 0; inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; - strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink, - BEFS_SYMLINK_LEN - 1); - befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0'; + strlcpy(befs_ino->i_data.symlink, raw_inode->data.symlink, + BEFS_SYMLINK_LEN); } else { int num_blks; @@ -428,7 +422,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) } brelse(bh); - befs_debug(sb, "<--- befs_read_inode()"); + befs_debug(sb, "<--- %s", __func__); unlock_new_inode(inode); return inode; @@ -437,7 +431,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) unacquire_none: iget_failed(inode); - befs_debug(sb, "<--- befs_read_inode() - Bad inode"); + befs_debug(sb, "<--- %s - Bad inode", __func__); return ERR_PTR(ret); } @@ -445,7 +439,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) * * Taken from NFS implementation by Al Viro. */ -static int +static int __init befs_init_inodecache(void) { befs_inode_cachep = kmem_cache_create("befs_inode_cache", @@ -454,11 +448,9 @@ befs_init_inodecache(void) SLAB_MEM_SPREAD), init_once); if (befs_inode_cachep == NULL) { - printk(KERN_ERR "befs_init_inodecache: " - "Couldn't initialize inode slabcache\n"); + pr_err("%s: Couldn't initialize inode slabcache\n", __func__); return -ENOMEM; } - return 0; } @@ -544,16 +536,16 @@ befs_utf2nls(struct super_block *sb, const char *in, */ int maxlen = in_len + 1; - befs_debug(sb, "---> utf2nls()"); + befs_debug(sb, "---> %s", __func__); if (!nls) { - befs_error(sb, "befs_utf2nls called with no NLS table loaded"); + befs_error(sb, "%s called with no NLS table loaded", __func__); return -EINVAL; } *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "befs_utf2nls() cannot allocate memory"); + befs_error(sb, "%s cannot allocate memory", __func__); *out_len = 0; return -ENOMEM; } @@ -575,14 +567,14 @@ befs_utf2nls(struct super_block *sb, const char *in, result[o] = '\0'; *out_len = o; - befs_debug(sb, "<--- utf2nls()"); + befs_debug(sb, "<--- %s", __func__); return o; conv_err: befs_error(sb, "Name using character set %s contains a character that " "cannot be converted to unicode.", nls->charset); - befs_debug(sb, "<--- utf2nls()"); + befs_debug(sb, "<--- %s", __func__); kfree(result); return -EILSEQ; } @@ -590,21 +582,21 @@ befs_utf2nls(struct super_block *sb, const char *in, /** * befs_nls2utf - Convert NLS string to utf8 encodeing * @sb: Superblock - * @src: Input string buffer in NLS format - * @srclen: Length of input string in bytes - * @dest: The output string in UTF-8 format - * @destlen: Length of the output buffer + * @in: Input string buffer in NLS format + * @in_len: Length of input string in bytes + * @out: The output string in UTF-8 format + * @out_len: Length of the output buffer * - * Converts input string @src, which is in the format of the loaded NLS map, + * Converts input string @in, which is in the format of the loaded NLS map, * into a utf8 string. * - * The destination string @dest is allocated by this function and the caller is + * The destination string @out is allocated by this function and the caller is * responsible for freeing it with kfree() * - * On return, *@destlen is the length of @dest in bytes. + * On return, *@out_len is the length of @out in bytes. * * On success, the return value is the number of utf8 characters written to - * the output buffer @dest. + * the output buffer @out. * * On Failure, a negative number coresponding to the error code is returned. */ @@ -623,16 +615,17 @@ befs_nls2utf(struct super_block *sb, const char *in, * in special cases */ int maxlen = (3 * in_len) + 1; - befs_debug(sb, "---> nls2utf()\n"); + befs_debug(sb, "---> %s\n", __func__); if (!nls) { - befs_error(sb, "befs_nls2utf called with no NLS table loaded."); + befs_error(sb, "%s called with no NLS table loaded.", + __func__); return -EINVAL; } *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "befs_nls2utf() cannot allocate memory"); + befs_error(sb, "%s cannot allocate memory", __func__); *out_len = 0; return -ENOMEM; } @@ -653,14 +646,14 @@ befs_nls2utf(struct super_block *sb, const char *in, result[o] = '\0'; *out_len = o; - befs_debug(sb, "<--- nls2utf()"); + befs_debug(sb, "<--- %s", __func__); return i; conv_err: befs_error(sb, "Name using charecter set %s contains a charecter that " "cannot be converted to unicode.", nls->charset); - befs_debug(sb, "<--- nls2utf()"); + befs_debug(sb, "<--- %s", __func__); kfree(result); return -EILSEQ; } @@ -715,8 +708,8 @@ parse_options(char *options, befs_mount_options * opts) if (option >= 0) uid = make_kuid(current_user_ns(), option); if (!uid_valid(uid)) { - printk(KERN_ERR "BeFS: Invalid uid %d, " - "using default\n", option); + pr_err("Invalid uid %d, " + "using default\n", option); break; } opts->uid = uid; @@ -729,8 +722,8 @@ parse_options(char *options, befs_mount_options * opts) if (option >= 0) gid = make_kgid(current_user_ns(), option); if (!gid_valid(gid)) { - printk(KERN_ERR "BeFS: Invalid gid %d, " - "using default\n", option); + pr_err("Invalid gid %d, " + "using default\n", option); break; } opts->gid = gid; @@ -740,8 +733,8 @@ parse_options(char *options, befs_mount_options * opts) kfree(opts->iocharset); opts->iocharset = match_strdup(&args[0]); if (!opts->iocharset) { - printk(KERN_ERR "BeFS: allocation failure for " - "iocharset string\n"); + pr_err("allocation failure for " + "iocharset string\n"); return 0; } break; @@ -749,8 +742,8 @@ parse_options(char *options, befs_mount_options * opts) opts->debug = 1; break; default: - printk(KERN_ERR "BeFS: Unrecognized mount option \"%s\" " - "or missing value\n", p); + pr_err("Unrecognized mount option \"%s\" " + "or missing value\n", p); return 0; } } @@ -791,22 +784,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent) save_mount_options(sb, data); - sb->s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL); + sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); if (sb->s_fs_info == NULL) { - printk(KERN_ERR - "BeFS(%s): Unable to allocate memory for private " + pr_err("(%s): Unable to allocate memory for private " "portion of superblock. Bailing.\n", sb->s_id); goto unacquire_none; } befs_sb = BEFS_SB(sb); - memset(befs_sb, 0, sizeof(befs_sb_info)); if (!parse_options((char *) data, &befs_sb->mount_opts)) { befs_error(sb, "cannot parse mount options"); goto unacquire_priv_sbp; } - befs_debug(sb, "---> befs_fill_super()"); + befs_debug(sb, "---> %s", __func__); #ifndef CONFIG_BEFS_RW if (!(sb->s_flags & MS_RDONLY)) { @@ -854,7 +845,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) goto unacquire_priv_sbp; if( befs_sb->num_blocks > ~((sector_t)0) ) { - befs_error(sb, "blocks count: %Lu " + befs_error(sb, "blocks count: %llu " "is larger than the host can use", befs_sb->num_blocks); goto unacquire_priv_sbp; @@ -913,6 +904,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) static int befs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (!(*flags & MS_RDONLY)) return -EINVAL; return 0; @@ -924,7 +916,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - befs_debug(sb, "---> befs_statfs()"); + befs_debug(sb, "---> %s", __func__); buf->f_type = BEFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -937,7 +929,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = BEFS_NAME_LEN; - befs_debug(sb, "<--- befs_statfs()"); + befs_debug(sb, "<--- %s", __func__); return 0; } @@ -963,7 +955,7 @@ init_befs_fs(void) { int err; - printk(KERN_INFO "BeFS version: %s\n", BEFS_VERSION); + pr_info("version: %s\n", BEFS_VERSION); err = befs_init_inodecache(); if (err) diff --git a/fs/bfs/file.c b/fs/bfs/file.c index ae289221833..e7f88ace1a2 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -23,10 +23,10 @@ const struct file_operations bfs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, }; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 8defc6b3f9a..7041ac35ace 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -172,7 +172,7 @@ static void bfs_evict_inode(struct inode *inode) dprintf("ino=%08lx\n", ino); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); @@ -266,7 +266,7 @@ static void init_once(void *foo) inode_init_once(&bi->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 89dec7f789a..ca0ba15a730 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -45,7 +45,6 @@ static int load_aout_library(struct file*); */ static int aout_core_dump(struct coredump_params *cprm) { - struct file *file = cprm->file; mm_segment_t fs; int has_dumped = 0; void __user *dump_start; @@ -85,10 +84,10 @@ static int aout_core_dump(struct coredump_params *cprm) set_fs(KERNEL_DS); /* struct user */ - if (!dump_write(file, &dump, sizeof(dump))) + if (!dump_emit(cprm, &dump, sizeof(dump))) goto end_coredump; /* Now dump all of the user data. Include malloced stuff as well */ - if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump))) + if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump))) goto end_coredump; /* now we start writing out the user space info */ set_fs(USER_DS); @@ -96,14 +95,14 @@ static int aout_core_dump(struct coredump_params *cprm) if (dump.u_dsize != 0) { dump_start = START_DATA(dump); dump_size = dump.u_dsize << PAGE_SHIFT; - if (!dump_write(file, dump_start, dump_size)) + if (!dump_emit(cprm, dump_start, dump_size)) goto end_coredump; } /* Now prepare to dump the stack area */ if (dump.u_ssize != 0) { dump_start = START_STACK(dump); dump_size = dump.u_ssize << PAGE_SHIFT; - if (!dump_write(file, dump_start, dump_size)) + if (!dump_emit(cprm, dump_start, dump_size)) goto end_coredump; } end_coredump: @@ -221,7 +220,7 @@ static int load_aout_binary(struct linux_binprm * bprm) * Requires a mmap handler. This prevents people from using a.out * as part of an exploit attack against /proc-related vulnerabilities. */ - if (!bprm->file->f_op || !bprm->file->f_op->mmap) + if (!bprm->file->f_op->mmap) return -ENOEXEC; fd_offset = N_TXTOFF(ex); @@ -374,7 +373,7 @@ static int load_aout_library(struct file *file) * Requires a mmap handler. This prevents people from using a.out * as part of an exploit attack against /proc-related vulnerabilities. */ - if (!file->f_op || !file->f_op->mmap) + if (!file->f_op->mmap) goto out; if (N_FLAGS(ex)) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 4c94a79991b..3892c1a2324 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -46,10 +46,15 @@ #endif static int load_elf_binary(struct linux_binprm *bprm); -static int load_elf_library(struct file *); static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long); +#ifdef CONFIG_USELIB +static int load_elf_library(struct file *); +#else +#define load_elf_library NULL +#endif + /* * If we don't support core dumping, then supply a NULL so we * don't even try. @@ -406,7 +411,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, goto out; if (!elf_check_arch(interp_elf_ex)) goto out; - if (!interpreter->f_op || !interpreter->f_op->mmap) + if (!interpreter->f_op->mmap) goto out; /* @@ -543,9 +548,6 @@ out: * libraries. There is no binary dependent code anywhere else. */ -#define INTERPRETER_NONE 0 -#define INTERPRETER_ELF 2 - #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif @@ -582,7 +584,6 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; - unsigned long def_flags = 0; struct pt_regs *regs = current_pt_regs(); struct { struct elfhdr elf_ex; @@ -607,7 +608,7 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; if (!elf_check_arch(&loc->elf_ex)) goto out; - if (!bprm->file->f_op || !bprm->file->f_op->mmap) + if (!bprm->file->f_op->mmap) goto out; /* Now read in all of the header information */ @@ -722,9 +723,6 @@ static int load_elf_binary(struct linux_binprm *bprm) if (retval) goto out_free_dentry; - /* OK, This is the point of no return */ - current->mm->def_flags = def_flags; - /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ SET_PERSONALITY(loc->elf_ex); @@ -1008,6 +1006,7 @@ out_free_ph: goto out; } +#ifdef CONFIG_USELIB /* This is really simpleminded and specialized - we are loading an a.out library that is given an ELF header. */ static int load_elf_library(struct file *file) @@ -1028,7 +1027,7 @@ static int load_elf_library(struct file *file) /* First of all, some simple consistency checks */ if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 || - !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap) + !elf_check_arch(&elf_ex) || !file->f_op->mmap) goto out; /* Now read in all of the header information */ @@ -1086,6 +1085,7 @@ out_free_ph: out: return error; } +#endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_ELF_CORE /* @@ -1108,6 +1108,14 @@ static bool always_dump_vma(struct vm_area_struct *vma) /* Any vsyscall mappings? */ if (vma == get_gate_vma(vma->vm_mm)) return true; + + /* + * Assume that all vmas with a .name op should always be dumped. + * If this changes, a new vm_ops field can easily be added. + */ + if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) + return true; + /* * arch_vma_name() returns non-NULL for special architecture mappings, * such as vDSO sections. @@ -1225,35 +1233,17 @@ static int notesize(struct memelfnote *en) return sz; } -#define DUMP_WRITE(addr, nr, foffset) \ - do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0) - -static int alignfile(struct file *file, loff_t *foffset) -{ - static const char buf[4] = { 0, }; - DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset); - return 1; -} - -static int writenote(struct memelfnote *men, struct file *file, - loff_t *foffset) +static int writenote(struct memelfnote *men, struct coredump_params *cprm) { struct elf_note en; en.n_namesz = strlen(men->name) + 1; en.n_descsz = men->datasz; en.n_type = men->type; - DUMP_WRITE(&en, sizeof(en), foffset); - DUMP_WRITE(men->name, en.n_namesz, foffset); - if (!alignfile(file, foffset)) - return 0; - DUMP_WRITE(men->data, men->datasz, foffset); - if (!alignfile(file, foffset)) - return 0; - - return 1; + return dump_emit(cprm, &en, sizeof(en)) && + dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) && + dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4); } -#undef DUMP_WRITE static void fill_elf_header(struct elfhdr *elf, int segs, u16 machine, u32 flags) @@ -1392,7 +1382,7 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) } static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, - siginfo_t *siginfo) + const siginfo_t *siginfo) { mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); @@ -1599,7 +1589,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - siginfo_t *siginfo, struct pt_regs *regs) + const siginfo_t *siginfo, struct pt_regs *regs) { struct task_struct *dump_task = current; const struct user_regset_view *view = task_user_regset_view(dump_task); @@ -1702,33 +1692,33 @@ static size_t get_note_info_size(struct elf_note_info *info) * process-wide notes are interleaved after the first thread-specific note. */ static int write_note_info(struct elf_note_info *info, - struct file *file, loff_t *foffset) + struct coredump_params *cprm) { - bool first = 1; + bool first = true; struct elf_thread_core_info *t = info->thread; do { int i; - if (!writenote(&t->notes[0], file, foffset)) + if (!writenote(&t->notes[0], cprm)) return 0; - if (first && !writenote(&info->psinfo, file, foffset)) + if (first && !writenote(&info->psinfo, cprm)) return 0; - if (first && !writenote(&info->signote, file, foffset)) + if (first && !writenote(&info->signote, cprm)) return 0; - if (first && !writenote(&info->auxv, file, foffset)) + if (first && !writenote(&info->auxv, cprm)) return 0; if (first && info->files.data && - !writenote(&info->files, file, foffset)) + !writenote(&info->files, cprm)) return 0; for (i = 1; i < info->thread_notes; ++i) if (t->notes[i].data && - !writenote(&t->notes[i], file, foffset)) + !writenote(&t->notes[i], cprm)) return 0; - first = 0; + first = false; t = t->next; } while (t); @@ -1848,34 +1838,31 @@ static int elf_note_info_init(struct elf_note_info *info) static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - siginfo_t *siginfo, struct pt_regs *regs) + const siginfo_t *siginfo, struct pt_regs *regs) { struct list_head *t; + struct core_thread *ct; + struct elf_thread_status *ets; if (!elf_note_info_init(info)) return 0; - if (siginfo->si_signo) { - struct core_thread *ct; - struct elf_thread_status *ets; - - for (ct = current->mm->core_state->dumper.next; - ct; ct = ct->next) { - ets = kzalloc(sizeof(*ets), GFP_KERNEL); - if (!ets) - return 0; + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + ets = kzalloc(sizeof(*ets), GFP_KERNEL); + if (!ets) + return 0; - ets->thread = ct->task; - list_add(&ets->list, &info->thread_list); - } + ets->thread = ct->task; + list_add(&ets->list, &info->thread_list); + } - list_for_each(t, &info->thread_list) { - int sz; + list_for_each(t, &info->thread_list) { + int sz; - ets = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(siginfo->si_signo, ets); - info->thread_status_size += sz; - } + ets = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(siginfo->si_signo, ets); + info->thread_status_size += sz; } /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); @@ -1935,13 +1922,13 @@ static size_t get_note_info_size(struct elf_note_info *info) } static int write_note_info(struct elf_note_info *info, - struct file *file, loff_t *foffset) + struct coredump_params *cprm) { int i; struct list_head *t; for (i = 0; i < info->numnote; i++) - if (!writenote(info->notes + i, file, foffset)) + if (!writenote(info->notes + i, cprm)) return 0; /* write out the thread status notes section */ @@ -1950,7 +1937,7 @@ static int write_note_info(struct elf_note_info *info, list_entry(t, struct elf_thread_status, list); for (i = 0; i < tmp->num_notes; i++) - if (!writenote(&tmp->notes[i], file, foffset)) + if (!writenote(&tmp->notes[i], cprm)) return 0; } @@ -2046,10 +2033,9 @@ static int elf_core_dump(struct coredump_params *cprm) int has_dumped = 0; mm_segment_t fs; int segs; - size_t size = 0; struct vm_area_struct *vma, *gate_vma; struct elfhdr *elf = NULL; - loff_t offset = 0, dataoff, foffset; + loff_t offset = 0, dataoff; struct elf_note_info info = { }; struct elf_phdr *phdr4note = NULL; struct elf_shdr *shdr4extnum = NULL; @@ -2105,7 +2091,6 @@ static int elf_core_dump(struct coredump_params *cprm) offset += sizeof(*elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ - foffset = offset; /* Write notes phdr entry */ { @@ -2136,13 +2121,10 @@ static int elf_core_dump(struct coredump_params *cprm) offset = dataoff; - size += sizeof(*elf); - if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) + if (!dump_emit(cprm, elf, sizeof(*elf))) goto end_coredump; - size += sizeof(*phdr4note); - if (size > cprm->limit - || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note))) + if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) goto end_coredump; /* Write program headers for segments dump */ @@ -2164,24 +2146,22 @@ static int elf_core_dump(struct coredump_params *cprm) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; - size += sizeof(phdr); - if (size > cprm->limit - || !dump_write(cprm->file, &phdr, sizeof(phdr))) + if (!dump_emit(cprm, &phdr, sizeof(phdr))) goto end_coredump; } - if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit)) + if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; /* write out the notes section */ - if (!write_note_info(&info, cprm->file, &foffset)) + if (!write_note_info(&info, cprm)) goto end_coredump; - if (elf_coredump_extra_notes_write(cprm->file, &foffset)) + if (elf_coredump_extra_notes_write(cprm)) goto end_coredump; /* Align to page */ - if (!dump_seek(cprm->file, dataoff - foffset)) + if (!dump_skip(cprm, dataoff - cprm->written)) goto end_coredump; for (vma = first_vma(current, gate_vma); vma != NULL; @@ -2198,26 +2178,21 @@ static int elf_core_dump(struct coredump_params *cprm) page = get_dump_page(addr); if (page) { void *kaddr = kmap(page); - stop = ((size += PAGE_SIZE) > cprm->limit) || - !dump_write(cprm->file, kaddr, - PAGE_SIZE); + stop = !dump_emit(cprm, kaddr, PAGE_SIZE); kunmap(page); page_cache_release(page); } else - stop = !dump_seek(cprm->file, PAGE_SIZE); + stop = !dump_skip(cprm, PAGE_SIZE); if (stop) goto end_coredump; } } - if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) + if (!elf_core_write_extra_data(cprm)) goto end_coredump; if (e_phnum == PN_XNUM) { - size += sizeof(*shdr4extnum); - if (size > cprm->limit - || !dump_write(cprm->file, shdr4extnum, - sizeof(*shdr4extnum))) + if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum))) goto end_coredump; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c166f325a18..fe2a643ee00 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -111,7 +111,7 @@ static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) return 0; if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr)) return 0; - if (!file->f_op || !file->f_op->mmap) + if (!file->f_op->mmap) return 0; return 1; } @@ -1267,35 +1267,17 @@ static int notesize(struct memelfnote *en) /* #define DEBUG */ -#define DUMP_WRITE(addr, nr, foffset) \ - do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0) - -static int alignfile(struct file *file, loff_t *foffset) -{ - static const char buf[4] = { 0, }; - DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset); - return 1; -} - -static int writenote(struct memelfnote *men, struct file *file, - loff_t *foffset) +static int writenote(struct memelfnote *men, struct coredump_params *cprm) { struct elf_note en; en.n_namesz = strlen(men->name) + 1; en.n_descsz = men->datasz; en.n_type = men->type; - DUMP_WRITE(&en, sizeof(en), foffset); - DUMP_WRITE(men->name, en.n_namesz, foffset); - if (!alignfile(file, foffset)) - return 0; - DUMP_WRITE(men->data, men->datasz, foffset); - if (!alignfile(file, foffset)) - return 0; - - return 1; + return dump_emit(cprm, &en, sizeof(en)) && + dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) && + dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4); } -#undef DUMP_WRITE static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) { @@ -1500,66 +1482,40 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, /* * dump the segments for an MMU process */ -#ifdef CONFIG_MMU -static int elf_fdpic_dump_segments(struct file *file, size_t *size, - unsigned long *limit, unsigned long mm_flags) +static bool elf_fdpic_dump_segments(struct coredump_params *cprm) { struct vm_area_struct *vma; - int err = 0; for (vma = current->mm->mmap; vma; vma = vma->vm_next) { unsigned long addr; - if (!maydump(vma, mm_flags)) + if (!maydump(vma, cprm->mm_flags)) continue; +#ifdef CONFIG_MMU for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { + bool res; struct page *page = get_dump_page(addr); if (page) { void *kaddr = kmap(page); - *size += PAGE_SIZE; - if (*size > *limit) - err = -EFBIG; - else if (!dump_write(file, kaddr, PAGE_SIZE)) - err = -EIO; + res = dump_emit(cprm, kaddr, PAGE_SIZE); kunmap(page); page_cache_release(page); - } else if (!dump_seek(file, PAGE_SIZE)) - err = -EFBIG; - if (err) - goto out; + } else { + res = dump_skip(cprm, PAGE_SIZE); + } + if (!res) + return false; } - } -out: - return err; -} -#endif - -/* - * dump the segments for a NOMMU process - */ -#ifndef CONFIG_MMU -static int elf_fdpic_dump_segments(struct file *file, size_t *size, - unsigned long *limit, unsigned long mm_flags) -{ - struct vm_area_struct *vma; - - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { - if (!maydump(vma, mm_flags)) - continue; - - if ((*size += PAGE_SIZE) > *limit) - return -EFBIG; - - if (!dump_write(file, (void *) vma->vm_start, +#else + if (!dump_emit(cprm, (void *) vma->vm_start, vma->vm_end - vma->vm_start)) - return -EIO; + return false; +#endif } - - return 0; + return true; } -#endif static size_t elf_core_vma_data_size(unsigned long mm_flags) { @@ -1585,11 +1541,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) int has_dumped = 0; mm_segment_t fs; int segs; - size_t size = 0; int i; struct vm_area_struct *vma; struct elfhdr *elf = NULL; - loff_t offset = 0, dataoff, foffset; + loff_t offset = 0, dataoff; int numnote; struct memelfnote *notes = NULL; struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ @@ -1606,6 +1561,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; + struct core_thread *ct; + struct elf_thread_status *tmp; /* * We no longer stop all VM operations. @@ -1641,28 +1598,23 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto cleanup; #endif - if (cprm->siginfo->si_signo) { - struct core_thread *ct; - struct elf_thread_status *tmp; - - for (ct = current->mm->core_state->dumper.next; - ct; ct = ct->next) { - tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); - if (!tmp) - goto cleanup; + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + goto cleanup; - tmp->thread = ct->task; - list_add(&tmp->list, &thread_list); - } + tmp->thread = ct->task; + list_add(&tmp->list, &thread_list); + } - list_for_each(t, &thread_list) { - struct elf_thread_status *tmp; - int sz; + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp; + int sz; - tmp = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp); - thread_status_size += sz; - } + tmp = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp); + thread_status_size += sz; } /* now collect the dump for the current */ @@ -1720,7 +1672,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) offset += sizeof(*elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ - foffset = offset; /* Write notes phdr entry */ { @@ -1755,13 +1706,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) offset = dataoff; - size += sizeof(*elf); - if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) + if (!dump_emit(cprm, elf, sizeof(*elf))) goto end_coredump; - size += sizeof(*phdr4note); - if (size > cprm->limit - || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note))) + if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) goto end_coredump; /* write program headers for segments dump */ @@ -1785,18 +1733,16 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; - size += sizeof(phdr); - if (size > cprm->limit - || !dump_write(cprm->file, &phdr, sizeof(phdr))) + if (!dump_emit(cprm, &phdr, sizeof(phdr))) goto end_coredump; } - if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit)) + if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; /* write out the notes section */ for (i = 0; i < numnote; i++) - if (!writenote(notes + i, cprm->file, &foffset)) + if (!writenote(notes + i, cprm)) goto end_coredump; /* write out the thread status notes section */ @@ -1805,25 +1751,21 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) list_entry(t, struct elf_thread_status, list); for (i = 0; i < tmp->num_notes; i++) - if (!writenote(&tmp->notes[i], cprm->file, &foffset)) + if (!writenote(&tmp->notes[i], cprm)) goto end_coredump; } - if (!dump_seek(cprm->file, dataoff - foffset)) + if (!dump_skip(cprm, dataoff - cprm->written)) goto end_coredump; - if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit, - cprm->mm_flags) < 0) + if (!elf_fdpic_dump_segments(cprm)) goto end_coredump; - if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) + if (!elf_core_write_extra_data(cprm)) goto end_coredump; if (e_phnum == PN_XNUM) { - size += sizeof(*shdr4extnum); - if (size > cprm->limit - || !dump_write(cprm->file, shdr4extnum, - sizeof(*shdr4extnum))) + if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum))) goto end_coredump; } diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 037a3e2b045..f37b08cea1f 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -38,7 +38,7 @@ static int load_em86(struct linux_binprm *bprm) /* First of all, some simple consistency checks */ if ((elf_ex.e_type != ET_EXEC && elf_ex.e_type != ET_DYN) || (!((elf_ex.e_machine == EM_386) || (elf_ex.e_machine == EM_486))) || - (!bprm->file->f_op || !bprm->file->f_op->mmap)) { + !bprm->file->f_op->mmap) { return -ENOEXEC; } diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index d50bbe59da1..f723cd3a455 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -380,7 +380,7 @@ failed: /****************************************************************************/ -void old_reloc(unsigned long rl) +static void old_reloc(unsigned long rl) { #ifdef DEBUG char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 1c740e152f3..b60500300dd 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -656,6 +656,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer, mutex_unlock(&root->d_inode->i_mutex); dput(root); + break; default: return res; } return count; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c deleted file mode 100644 index fc60b31453e..00000000000 --- a/fs/bio-integrity.c +++ /dev/null @@ -1,759 +0,0 @@ -/* - * bio-integrity.c - bio data integrity extensions - * - * Copyright (C) 2007, 2008, 2009 Oracle Corporation - * Written by: Martin K. Petersen <martin.petersen@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. - * - */ - -#include <linux/blkdev.h> -#include <linux/mempool.h> -#include <linux/export.h> -#include <linux/bio.h> -#include <linux/workqueue.h> -#include <linux/slab.h> - -#define BIP_INLINE_VECS 4 - -static struct kmem_cache *bip_slab; -static struct workqueue_struct *kintegrityd_wq; - -/** - * bio_integrity_alloc - Allocate integrity payload and attach it to bio - * @bio: bio to attach integrity metadata to - * @gfp_mask: Memory allocation mask - * @nr_vecs: Number of integrity metadata scatter-gather elements - * - * Description: This function prepares a bio for attaching integrity - * metadata. nr_vecs specifies the maximum number of pages containing - * integrity metadata that can be attached. - */ -struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs) -{ - struct bio_integrity_payload *bip; - struct bio_set *bs = bio->bi_pool; - unsigned long idx = BIO_POOL_NONE; - unsigned inline_vecs; - - if (!bs) { - bip = kmalloc(sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * nr_vecs, gfp_mask); - inline_vecs = nr_vecs; - } else { - bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIP_INLINE_VECS; - } - - if (unlikely(!bip)) - return NULL; - - memset(bip, 0, sizeof(*bip)); - - if (nr_vecs > inline_vecs) { - bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, - bs->bvec_integrity_pool); - if (!bip->bip_vec) - goto err; - } else { - bip->bip_vec = bip->bip_inline_vecs; - } - - bip->bip_slab = idx; - bip->bip_bio = bio; - bio->bi_integrity = bip; - - return bip; -err: - mempool_free(bip, bs->bio_integrity_pool); - return NULL; -} -EXPORT_SYMBOL(bio_integrity_alloc); - -/** - * bio_integrity_free - Free bio integrity payload - * @bio: bio containing bip to be freed - * - * Description: Used to free the integrity portion of a bio. Usually - * called from bio_free(). - */ -void bio_integrity_free(struct bio *bio) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct bio_set *bs = bio->bi_pool; - - if (bip->bip_owns_buf) - kfree(bip->bip_buf); - - if (bs) { - if (bip->bip_slab != BIO_POOL_NONE) - bvec_free(bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_slab); - - mempool_free(bip, bs->bio_integrity_pool); - } else { - kfree(bip); - } - - bio->bi_integrity = NULL; -} -EXPORT_SYMBOL(bio_integrity_free); - -/** - * bio_integrity_add_page - Attach integrity metadata - * @bio: bio to update - * @page: page containing integrity metadata - * @len: number of bytes of integrity metadata in page - * @offset: start offset within page - * - * Description: Attach a page containing integrity metadata to bio. - */ -int bio_integrity_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct bio_vec *iv; - - if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) { - printk(KERN_ERR "%s: bip_vec full\n", __func__); - return 0; - } - - iv = bip_vec_idx(bip, bip->bip_vcnt); - BUG_ON(iv == NULL); - - iv->bv_page = page; - iv->bv_len = len; - iv->bv_offset = offset; - bip->bip_vcnt++; - - return len; -} -EXPORT_SYMBOL(bio_integrity_add_page); - -static int bdev_integrity_enabled(struct block_device *bdev, int rw) -{ - struct blk_integrity *bi = bdev_get_integrity(bdev); - - if (bi == NULL) - return 0; - - if (rw == READ && bi->verify_fn != NULL && - (bi->flags & INTEGRITY_FLAG_READ)) - return 1; - - if (rw == WRITE && bi->generate_fn != NULL && - (bi->flags & INTEGRITY_FLAG_WRITE)) - return 1; - - return 0; -} - -/** - * bio_integrity_enabled - Check whether integrity can be passed - * @bio: bio to check - * - * Description: Determines whether bio_integrity_prep() can be called - * on this bio or not. bio data direction and target device must be - * set prior to calling. The functions honors the write_generate and - * read_verify flags in sysfs. - */ -int bio_integrity_enabled(struct bio *bio) -{ - /* Already protected? */ - if (bio_integrity(bio)) - return 0; - - return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); -} -EXPORT_SYMBOL(bio_integrity_enabled); - -/** - * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto - * @bi: blk_integrity profile for device - * @sectors: Number of 512 sectors to convert - * - * Description: The block layer calculates everything in 512 byte - * sectors but integrity metadata is done in terms of the hardware - * sector size of the storage device. Convert the block layer sectors - * to physical sectors. - */ -static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, - unsigned int sectors) -{ - /* At this point there are only 512b or 4096b DIF/EPP devices */ - if (bi->sector_size == 4096) - return sectors >>= 3; - - return sectors; -} - -/** - * bio_integrity_tag_size - Retrieve integrity tag space - * @bio: bio to inspect - * - * Description: Returns the maximum number of tag bytes that can be - * attached to this bio. Filesystems can use this to determine how - * much metadata to attach to an I/O. - */ -unsigned int bio_integrity_tag_size(struct bio *bio) -{ - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - - BUG_ON(bio->bi_size == 0); - - return bi->tag_size * (bio->bi_size / bi->sector_size); -} -EXPORT_SYMBOL(bio_integrity_tag_size); - -int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; - - BUG_ON(bip->bip_buf == NULL); - - if (bi->tag_size == 0) - return -1; - - nr_sectors = bio_integrity_hw_sectors(bi, - DIV_ROUND_UP(len, bi->tag_size)); - - if (nr_sectors * bi->tuple_size > bip->bip_size) { - printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", - __func__, nr_sectors * bi->tuple_size, bip->bip_size); - return -1; - } - - if (set) - bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); - else - bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); - - return 0; -} - -/** - * bio_integrity_set_tag - Attach a tag buffer to a bio - * @bio: bio to attach buffer to - * @tag_buf: Pointer to a buffer containing tag data - * @len: Length of the included buffer - * - * Description: Use this function to tag a bio by leveraging the extra - * space provided by devices formatted with integrity protection. The - * size of the integrity buffer must be <= to the size reported by - * bio_integrity_tag_size(). - */ -int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) -{ - BUG_ON(bio_data_dir(bio) != WRITE); - - return bio_integrity_tag(bio, tag_buf, len, 1); -} -EXPORT_SYMBOL(bio_integrity_set_tag); - -/** - * bio_integrity_get_tag - Retrieve a tag buffer from a bio - * @bio: bio to retrieve buffer from - * @tag_buf: Pointer to a buffer for the tag data - * @len: Length of the target buffer - * - * Description: Use this function to retrieve the tag buffer from a - * completed I/O. The size of the integrity buffer must be <= to the - * size reported by bio_integrity_tag_size(). - */ -int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) -{ - BUG_ON(bio_data_dir(bio) != READ); - - return bio_integrity_tag(bio, tag_buf, len, 0); -} -EXPORT_SYMBOL(bio_integrity_get_tag); - -/** - * bio_integrity_generate - Generate integrity metadata for a bio - * @bio: bio to generate integrity metadata for - * - * Description: Generates integrity metadata for a bio by calling the - * block device's generation callback function. The bio must have a - * bip attached with enough room to accommodate the generated - * integrity metadata. - */ -static void bio_integrity_generate(struct bio *bio) -{ - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - struct blk_integrity_exchg bix; - struct bio_vec *bv; - sector_t sector = bio->bi_sector; - unsigned int i, sectors, total; - void *prot_buf = bio->bi_integrity->bip_buf; - - total = 0; - bix.disk_name = bio->bi_bdev->bd_disk->disk_name; - bix.sector_size = bi->sector_size; - - bio_for_each_segment(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page); - bix.data_buf = kaddr + bv->bv_offset; - bix.data_size = bv->bv_len; - bix.prot_buf = prot_buf; - bix.sector = sector; - - bi->generate_fn(&bix); - - sectors = bv->bv_len / bi->sector_size; - sector += sectors; - prot_buf += sectors * bi->tuple_size; - total += sectors * bi->tuple_size; - BUG_ON(total > bio->bi_integrity->bip_size); - - kunmap_atomic(kaddr); - } -} - -static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) -{ - if (bi) - return bi->tuple_size; - - return 0; -} - -/** - * bio_integrity_prep - Prepare bio for integrity I/O - * @bio: bio to prepare - * - * Description: Allocates a buffer for integrity metadata, maps the - * pages and attaches them to a bio. The bio must have data - * direction, target device and start sector set priot to calling. In - * the WRITE case, integrity metadata will be generated using the - * block device's integrity function. In the READ case, the buffer - * will be prepared for DMA and a suitable end_io handler set up. - */ -int bio_integrity_prep(struct bio *bio) -{ - struct bio_integrity_payload *bip; - struct blk_integrity *bi; - struct request_queue *q; - void *buf; - unsigned long start, end; - unsigned int len, nr_pages; - unsigned int bytes, offset, i; - unsigned int sectors; - - bi = bdev_get_integrity(bio->bi_bdev); - q = bdev_get_queue(bio->bi_bdev); - BUG_ON(bi == NULL); - BUG_ON(bio_integrity(bio)); - - sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); - - /* Allocate kernel buffer for protection data */ - len = sectors * blk_integrity_tuple_size(bi); - buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); - if (unlikely(buf == NULL)) { - printk(KERN_ERR "could not allocate integrity buffer\n"); - return -ENOMEM; - } - - end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ((unsigned long) buf) >> PAGE_SHIFT; - nr_pages = end - start; - - /* Allocate bio integrity payload and integrity vectors */ - bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); - if (unlikely(bip == NULL)) { - printk(KERN_ERR "could not allocate data integrity bioset\n"); - kfree(buf); - return -EIO; - } - - bip->bip_owns_buf = 1; - bip->bip_buf = buf; - bip->bip_size = len; - bip->bip_sector = bio->bi_sector; - - /* Map it */ - offset = offset_in_page(buf); - for (i = 0 ; i < nr_pages ; i++) { - int ret; - bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - ret = bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset); - - if (ret == 0) - return 0; - - if (ret < bytes) - break; - - buf += bytes; - len -= bytes; - offset = 0; - } - - /* Install custom I/O completion handler if read verify is enabled */ - if (bio_data_dir(bio) == READ) { - bip->bip_end_io = bio->bi_end_io; - bio->bi_end_io = bio_integrity_endio; - } - - /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) - bio_integrity_generate(bio); - - return 0; -} -EXPORT_SYMBOL(bio_integrity_prep); - -/** - * bio_integrity_verify - Verify integrity metadata for a bio - * @bio: bio to verify - * - * Description: This function is called to verify the integrity of a - * bio. The data in the bio io_vec is compared to the integrity - * metadata returned by the HBA. - */ -static int bio_integrity_verify(struct bio *bio) -{ - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - struct blk_integrity_exchg bix; - struct bio_vec *bv; - sector_t sector = bio->bi_integrity->bip_sector; - unsigned int i, sectors, total, ret; - void *prot_buf = bio->bi_integrity->bip_buf; - - ret = total = 0; - bix.disk_name = bio->bi_bdev->bd_disk->disk_name; - bix.sector_size = bi->sector_size; - - bio_for_each_segment(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page); - bix.data_buf = kaddr + bv->bv_offset; - bix.data_size = bv->bv_len; - bix.prot_buf = prot_buf; - bix.sector = sector; - - ret = bi->verify_fn(&bix); - - if (ret) { - kunmap_atomic(kaddr); - return ret; - } - - sectors = bv->bv_len / bi->sector_size; - sector += sectors; - prot_buf += sectors * bi->tuple_size; - total += sectors * bi->tuple_size; - BUG_ON(total > bio->bi_integrity->bip_size); - - kunmap_atomic(kaddr); - } - - return ret; -} - -/** - * bio_integrity_verify_fn - Integrity I/O completion worker - * @work: Work struct stored in bio to be verified - * - * Description: This workqueue function is called to complete a READ - * request. The function verifies the transferred integrity metadata - * and then calls the original bio end_io function. - */ -static void bio_integrity_verify_fn(struct work_struct *work) -{ - struct bio_integrity_payload *bip = - container_of(work, struct bio_integrity_payload, bip_work); - struct bio *bio = bip->bip_bio; - int error; - - error = bio_integrity_verify(bio); - - /* Restore original bio completion handler */ - bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); -} - -/** - * bio_integrity_endio - Integrity I/O completion function - * @bio: Protected bio - * @error: Pointer to errno - * - * Description: Completion for integrity I/O - * - * Normally I/O completion is done in interrupt context. However, - * verifying I/O integrity is a time-consuming task which must be run - * in process context. This function postpones completion - * accordingly. - */ -void bio_integrity_endio(struct bio *bio, int error) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - - BUG_ON(bip->bip_bio != bio); - - /* In case of an I/O error there is no point in verifying the - * integrity metadata. Restore original bio end_io handler - * and run it. - */ - if (error) { - bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); - - return; - } - - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); - queue_work(kintegrityd_wq, &bip->bip_work); -} -EXPORT_SYMBOL(bio_integrity_endio); - -/** - * bio_integrity_mark_head - Advance bip_vec skip bytes - * @bip: Integrity vector to advance - * @skip: Number of bytes to advance it - */ -void bio_integrity_mark_head(struct bio_integrity_payload *bip, - unsigned int skip) -{ - struct bio_vec *iv; - unsigned int i; - - bip_for_each_vec(iv, bip, i) { - if (skip == 0) { - bip->bip_idx = i; - return; - } else if (skip >= iv->bv_len) { - skip -= iv->bv_len; - } else { /* skip < iv->bv_len) */ - iv->bv_offset += skip; - iv->bv_len -= skip; - bip->bip_idx = i; - return; - } - } -} - -/** - * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long - * @bip: Integrity vector to truncate - * @len: New length of integrity vector - */ -void bio_integrity_mark_tail(struct bio_integrity_payload *bip, - unsigned int len) -{ - struct bio_vec *iv; - unsigned int i; - - bip_for_each_vec(iv, bip, i) { - if (len == 0) { - bip->bip_vcnt = i; - return; - } else if (len >= iv->bv_len) { - len -= iv->bv_len; - } else { /* len < iv->bv_len) */ - iv->bv_len = len; - len = 0; - } - } -} - -/** - * bio_integrity_advance - Advance integrity vector - * @bio: bio whose integrity vector to update - * @bytes_done: number of data bytes that have been completed - * - * Description: This function calculates how many integrity bytes the - * number of completed data bytes correspond to and advances the - * integrity vector accordingly. - */ -void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; - - BUG_ON(bip == NULL); - BUG_ON(bi == NULL); - - nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); - bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); -} -EXPORT_SYMBOL(bio_integrity_advance); - -/** - * bio_integrity_trim - Trim integrity vector - * @bio: bio whose integrity vector to update - * @offset: offset to first data sector - * @sectors: number of data sectors - * - * Description: Used to trim the integrity vector in a cloned bio. - * The ivec will be advanced corresponding to 'offset' data sectors - * and the length will be truncated corresponding to 'len' data - * sectors. - */ -void bio_integrity_trim(struct bio *bio, unsigned int offset, - unsigned int sectors) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; - - BUG_ON(bip == NULL); - BUG_ON(bi == NULL); - BUG_ON(!bio_flagged(bio, BIO_CLONED)); - - nr_sectors = bio_integrity_hw_sectors(bi, sectors); - bip->bip_sector = bip->bip_sector + offset; - bio_integrity_mark_head(bip, offset * bi->tuple_size); - bio_integrity_mark_tail(bip, sectors * bi->tuple_size); -} -EXPORT_SYMBOL(bio_integrity_trim); - -/** - * bio_integrity_split - Split integrity metadata - * @bio: Protected bio - * @bp: Resulting bio_pair - * @sectors: Offset - * - * Description: Splits an integrity page into a bio_pair. - */ -void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) -{ - struct blk_integrity *bi; - struct bio_integrity_payload *bip = bio->bi_integrity; - unsigned int nr_sectors; - - if (bio_integrity(bio) == 0) - return; - - bi = bdev_get_integrity(bio->bi_bdev); - BUG_ON(bi == NULL); - BUG_ON(bip->bip_vcnt != 1); - - nr_sectors = bio_integrity_hw_sectors(bi, sectors); - - bp->bio1.bi_integrity = &bp->bip1; - bp->bio2.bi_integrity = &bp->bip2; - - bp->iv1 = bip->bip_vec[bip->bip_idx]; - bp->iv2 = bip->bip_vec[bip->bip_idx]; - - bp->bip1.bip_vec = &bp->iv1; - bp->bip2.bip_vec = &bp->iv2; - - bp->iv1.bv_len = sectors * bi->tuple_size; - bp->iv2.bv_offset += sectors * bi->tuple_size; - bp->iv2.bv_len -= sectors * bi->tuple_size; - - bp->bip1.bip_sector = bio->bi_integrity->bip_sector; - bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; - - bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; - bp->bip1.bip_idx = bp->bip2.bip_idx = 0; -} -EXPORT_SYMBOL(bio_integrity_split); - -/** - * bio_integrity_clone - Callback for cloning bios with integrity metadata - * @bio: New bio - * @bio_src: Original bio - * @gfp_mask: Memory allocation mask - * - * Description: Called to allocate a bip when cloning a bio - */ -int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - gfp_t gfp_mask) -{ - struct bio_integrity_payload *bip_src = bio_src->bi_integrity; - struct bio_integrity_payload *bip; - - BUG_ON(bip_src == NULL); - - bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); - - if (bip == NULL) - return -EIO; - - memcpy(bip->bip_vec, bip_src->bip_vec, - bip_src->bip_vcnt * sizeof(struct bio_vec)); - - bip->bip_sector = bip_src->bip_sector; - bip->bip_vcnt = bip_src->bip_vcnt; - bip->bip_idx = bip_src->bip_idx; - - return 0; -} -EXPORT_SYMBOL(bio_integrity_clone); - -int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - if (bs->bio_integrity_pool) - return 0; - - bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); - if (!bs->bio_integrity_pool) - return -1; - - bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); - if (!bs->bvec_integrity_pool) { - mempool_destroy(bs->bio_integrity_pool); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(bioset_integrity_create); - -void bioset_integrity_free(struct bio_set *bs) -{ - if (bs->bio_integrity_pool) - mempool_destroy(bs->bio_integrity_pool); - - if (bs->bvec_integrity_pool) - mempool_destroy(bs->bvec_integrity_pool); -} -EXPORT_SYMBOL(bioset_integrity_free); - -void __init bio_integrity_init(void) -{ - /* - * kintegrityd won't block much but may burn a lot of CPU cycles. - * Make it highpri CPU intensive wq with max concurrency of 1. - */ - kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); - if (!kintegrityd_wq) - panic("Failed to create kintegrityd\n"); - - bip_slab = kmem_cache_create("bio_integrity_payload", - sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIP_INLINE_VECS, - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - if (!bip_slab) - panic("Failed to create slab\n"); -} diff --git a/fs/bio.c b/fs/bio.c deleted file mode 100644 index ea5035da4d9..00000000000 --- a/fs/bio.c +++ /dev/null @@ -1,2029 +0,0 @@ -/* - * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public Licens - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- - * - */ -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/uio.h> -#include <linux/iocontext.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/mempool.h> -#include <linux/workqueue.h> -#include <linux/cgroup.h> -#include <scsi/sg.h> /* for struct sg_iovec */ - -#include <trace/events/block.h> - -/* - * Test patch to inline a certain number of bi_io_vec's inside the bio - * itself, to shrink a bio data allocation from two mempool calls to one - */ -#define BIO_INLINE_VECS 4 - -static mempool_t *bio_split_pool __read_mostly; - -/* - * if you change this list, also change bvec_alloc or things will - * break badly! cannot be bigger than what you can fit into an - * unsigned short - */ -#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } -static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { - BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), -}; -#undef BV - -/* - * fs_bio_set is the bio_set containing bio and iovec memory pools used by - * IO code that does not need private memory pools. - */ -struct bio_set *fs_bio_set; -EXPORT_SYMBOL(fs_bio_set); - -/* - * Our slab pool management - */ -struct bio_slab { - struct kmem_cache *slab; - unsigned int slab_ref; - unsigned int slab_size; - char name[8]; -}; -static DEFINE_MUTEX(bio_slab_lock); -static struct bio_slab *bio_slabs; -static unsigned int bio_slab_nr, bio_slab_max; - -static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) -{ - unsigned int sz = sizeof(struct bio) + extra_size; - struct kmem_cache *slab = NULL; - struct bio_slab *bslab, *new_bio_slabs; - unsigned int new_bio_slab_max; - unsigned int i, entry = -1; - - mutex_lock(&bio_slab_lock); - - i = 0; - while (i < bio_slab_nr) { - bslab = &bio_slabs[i]; - - if (!bslab->slab && entry == -1) - entry = i; - else if (bslab->slab_size == sz) { - slab = bslab->slab; - bslab->slab_ref++; - break; - } - i++; - } - - if (slab) - goto out_unlock; - - if (bio_slab_nr == bio_slab_max && entry == -1) { - new_bio_slab_max = bio_slab_max << 1; - new_bio_slabs = krealloc(bio_slabs, - new_bio_slab_max * sizeof(struct bio_slab), - GFP_KERNEL); - if (!new_bio_slabs) - goto out_unlock; - bio_slab_max = new_bio_slab_max; - bio_slabs = new_bio_slabs; - } - if (entry == -1) - entry = bio_slab_nr++; - - bslab = &bio_slabs[entry]; - - snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); - slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); - if (!slab) - goto out_unlock; - - printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry); - bslab->slab = slab; - bslab->slab_ref = 1; - bslab->slab_size = sz; -out_unlock: - mutex_unlock(&bio_slab_lock); - return slab; -} - -static void bio_put_slab(struct bio_set *bs) -{ - struct bio_slab *bslab = NULL; - unsigned int i; - - mutex_lock(&bio_slab_lock); - - for (i = 0; i < bio_slab_nr; i++) { - if (bs->bio_slab == bio_slabs[i].slab) { - bslab = &bio_slabs[i]; - break; - } - } - - if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) - goto out; - - WARN_ON(!bslab->slab_ref); - - if (--bslab->slab_ref) - goto out; - - kmem_cache_destroy(bslab->slab); - bslab->slab = NULL; - -out: - mutex_unlock(&bio_slab_lock); -} - -unsigned int bvec_nr_vecs(unsigned short idx) -{ - return bvec_slabs[idx].nr_vecs; -} - -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) -{ - BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); - - if (idx == BIOVEC_MAX_IDX) - mempool_free(bv, pool); - else { - struct biovec_slab *bvs = bvec_slabs + idx; - - kmem_cache_free(bvs->slab, bv); - } -} - -struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, - mempool_t *pool) -{ - struct bio_vec *bvl; - - /* - * see comment near bvec_array define! - */ - switch (nr) { - case 1: - *idx = 0; - break; - case 2 ... 4: - *idx = 1; - break; - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: - return NULL; - } - - /* - * idx now points to the pool we want to allocate from. only the - * 1-vec entry pool is mempool backed. - */ - if (*idx == BIOVEC_MAX_IDX) { -fallback: - bvl = mempool_alloc(pool, gfp_mask); - } else { - struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); - - /* - * Make this allocation restricted and don't dump info on - * allocation failures, since we'll fallback to the mempool - * in case of failure. - */ - __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - - /* - * Try a slab allocation. If this fails and __GFP_WAIT - * is set, retry with the 1-entry mempool - */ - bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { - *idx = BIOVEC_MAX_IDX; - goto fallback; - } - } - - return bvl; -} - -static void __bio_free(struct bio *bio) -{ - bio_disassociate_task(bio); - - if (bio_integrity(bio)) - bio_integrity_free(bio); -} - -static void bio_free(struct bio *bio) -{ - struct bio_set *bs = bio->bi_pool; - void *p; - - __bio_free(bio); - - if (bs) { - if (bio_flagged(bio, BIO_OWNS_VEC)) - bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); - - /* - * If we have front padding, adjust the bio pointer before freeing - */ - p = bio; - p -= bs->front_pad; - - mempool_free(p, bs->bio_pool); - } else { - /* Bio was allocated by bio_kmalloc() */ - kfree(bio); - } -} - -void bio_init(struct bio *bio) -{ - memset(bio, 0, sizeof(*bio)); - bio->bi_flags = 1 << BIO_UPTODATE; - atomic_set(&bio->bi_cnt, 1); -} -EXPORT_SYMBOL(bio_init); - -/** - * bio_reset - reinitialize a bio - * @bio: bio to reset - * - * Description: - * After calling bio_reset(), @bio will be in the same state as a freshly - * allocated bio returned bio bio_alloc_bioset() - the only fields that are - * preserved are the ones that are initialized by bio_alloc_bioset(). See - * comment in struct bio. - */ -void bio_reset(struct bio *bio) -{ - unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - - __bio_free(bio); - - memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags|(1 << BIO_UPTODATE); -} -EXPORT_SYMBOL(bio_reset); - -static void bio_alloc_rescue(struct work_struct *work) -{ - struct bio_set *bs = container_of(work, struct bio_set, rescue_work); - struct bio *bio; - - while (1) { - spin_lock(&bs->rescue_lock); - bio = bio_list_pop(&bs->rescue_list); - spin_unlock(&bs->rescue_lock); - - if (!bio) - break; - - generic_make_request(bio); - } -} - -static void punt_bios_to_rescuer(struct bio_set *bs) -{ - struct bio_list punt, nopunt; - struct bio *bio; - - /* - * In order to guarantee forward progress we must punt only bios that - * were allocated from this bio_set; otherwise, if there was a bio on - * there for a stacking driver higher up in the stack, processing it - * could require allocating bios from this bio_set, and doing that from - * our own rescuer would be bad. - * - * Since bio lists are singly linked, pop them all instead of trying to - * remove from the middle of the list: - */ - - bio_list_init(&punt); - bio_list_init(&nopunt); - - while ((bio = bio_list_pop(current->bio_list))) - bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); - - *current->bio_list = nopunt; - - spin_lock(&bs->rescue_lock); - bio_list_merge(&bs->rescue_list, &punt); - spin_unlock(&bs->rescue_lock); - - queue_work(bs->rescue_workqueue, &bs->rescue_work); -} - -/** - * bio_alloc_bioset - allocate a bio for I/O - * @gfp_mask: the GFP_ mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate - * @bs: the bio_set to allocate from. - * - * Description: - * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is - * backed by the @bs's mempool. - * - * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be - * able to allocate a bio. This is due to the mempool guarantees. To make this - * work, callers must never allocate more than 1 bio at a time from this pool. - * Callers that need to allocate more than 1 bio must always submit the - * previously allocated bio for IO before attempting to allocate a new one. - * Failure to do so can cause deadlocks under memory pressure. - * - * Note that when running under generic_make_request() (i.e. any block - * driver), bios are not submitted until after you return - see the code in - * generic_make_request() that converts recursion into iteration, to prevent - * stack overflows. - * - * This would normally mean allocating multiple bios under - * generic_make_request() would be susceptible to deadlocks, but we have - * deadlock avoidance code that resubmits any blocked bios from a rescuer - * thread. - * - * However, we do not guarantee forward progress for allocations from other - * mempools. Doing multiple allocations from the same mempool under - * generic_make_request() should be avoided - instead, use bio_set's front_pad - * for per bio allocations. - * - * RETURNS: - * Pointer to new bio on success, NULL on failure. - */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) -{ - gfp_t saved_gfp = gfp_mask; - unsigned front_pad; - unsigned inline_vecs; - unsigned long idx = BIO_POOL_NONE; - struct bio_vec *bvl = NULL; - struct bio *bio; - void *p; - - if (!bs) { - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - p = kmalloc(sizeof(struct bio) + - nr_iovecs * sizeof(struct bio_vec), - gfp_mask); - front_pad = 0; - inline_vecs = nr_iovecs; - } else { - /* - * generic_make_request() converts recursion to iteration; this - * means if we're running beneath it, any bios we allocate and - * submit will not be submitted (and thus freed) until after we - * return. - * - * This exposes us to a potential deadlock if we allocate - * multiple bios from the same bio_set() while running - * underneath generic_make_request(). If we were to allocate - * multiple bios (say a stacking block driver that was splitting - * bios), we would deadlock if we exhausted the mempool's - * reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are - * bios on current->bio_list, we first try the allocation - * without __GFP_WAIT; if that fails, we punt those bios we - * would be blocking to the rescuer workqueue before we retry - * with the original gfp_flags. - */ - - if (current->bio_list && !bio_list_empty(current->bio_list)) - gfp_mask &= ~__GFP_WAIT; - - p = mempool_alloc(bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(bs->bio_pool, gfp_mask); - } - - front_pad = bs->front_pad; - inline_vecs = BIO_INLINE_VECS; - } - - if (unlikely(!p)) - return NULL; - - bio = p + front_pad; - bio_init(bio); - - if (nr_iovecs > inline_vecs) { - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); - if (!bvl && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); - } - - if (unlikely(!bvl)) - goto err_free; - - bio->bi_flags |= 1 << BIO_OWNS_VEC; - } else if (nr_iovecs) { - bvl = bio->bi_inline_vecs; - } - - bio->bi_pool = bs; - bio->bi_flags |= idx << BIO_POOL_OFFSET; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bvl; - return bio; - -err_free: - mempool_free(p, bs->bio_pool); - return NULL; -} -EXPORT_SYMBOL(bio_alloc_bioset); - -void zero_fill_bio(struct bio *bio) -{ - unsigned long flags; - struct bio_vec *bv; - int i; - - bio_for_each_segment(bv, bio, i) { - char *data = bvec_kmap_irq(bv, &flags); - memset(data, 0, bv->bv_len); - flush_dcache_page(bv->bv_page); - bvec_kunmap_irq(data, &flags); - } -} -EXPORT_SYMBOL(zero_fill_bio); - -/** - * bio_put - release a reference to a bio - * @bio: bio to release reference to - * - * Description: - * Put a reference to a &struct bio, either one you have gotten with - * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. - **/ -void bio_put(struct bio *bio) -{ - BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); - - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->bi_cnt)) - bio_free(bio); -} -EXPORT_SYMBOL(bio_put); - -inline int bio_phys_segments(struct request_queue *q, struct bio *bio) -{ - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - - return bio->bi_phys_segments; -} -EXPORT_SYMBOL(bio_phys_segments); - -/** - * __bio_clone - clone a bio - * @bio: destination bio - * @bio_src: bio to clone - * - * Clone a &bio. Caller will own the returned bio, but not - * the actual data it points to. Reference count of returned - * bio will be one. - */ -void __bio_clone(struct bio *bio, struct bio *bio_src) -{ - memcpy(bio->bi_io_vec, bio_src->bi_io_vec, - bio_src->bi_max_vecs * sizeof(struct bio_vec)); - - /* - * most users will be overriding ->bi_bdev with a new target, - * so we don't set nor calculate new physical/hw segment counts here - */ - bio->bi_sector = bio_src->bi_sector; - bio->bi_bdev = bio_src->bi_bdev; - bio->bi_flags |= 1 << BIO_CLONED; - bio->bi_rw = bio_src->bi_rw; - bio->bi_vcnt = bio_src->bi_vcnt; - bio->bi_size = bio_src->bi_size; - bio->bi_idx = bio_src->bi_idx; -} -EXPORT_SYMBOL(__bio_clone); - -/** - * bio_clone_bioset - clone a bio - * @bio: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Like __bio_clone, only also allocates the returned bio - */ -struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, - struct bio_set *bs) -{ - struct bio *b; - - b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs); - if (!b) - return NULL; - - __bio_clone(b, bio); - - if (bio_integrity(bio)) { - int ret; - - ret = bio_integrity_clone(b, bio, gfp_mask); - - if (ret < 0) { - bio_put(b); - return NULL; - } - } - - return b; -} -EXPORT_SYMBOL(bio_clone_bioset); - -/** - * bio_get_nr_vecs - return approx number of vecs - * @bdev: I/O target - * - * Return the approximate number of pages we can send to this target. - * There's no guarantee that you will be able to fit this number of pages - * into a bio, it does not account for dynamic restrictions that vary - * on offset. - */ -int bio_get_nr_vecs(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - int nr_pages; - - nr_pages = min_t(unsigned, - queue_max_segments(q), - queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); - - return min_t(unsigned, nr_pages, BIO_MAX_PAGES); - -} -EXPORT_SYMBOL(bio_get_nr_vecs); - -static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset, - unsigned short max_sectors) -{ - int retried_segments = 0; - struct bio_vec *bvec; - - /* - * cloned bio must not modify vec list - */ - if (unlikely(bio_flagged(bio, BIO_CLONED))) - return 0; - - if (((bio->bi_size + len) >> 9) > max_sectors) - return 0; - - /* - * For filesystems with a blocksize smaller than the pagesize - * we will often be called with the same page as last time and - * a consecutive offset. Optimize this special case. - */ - if (bio->bi_vcnt > 0) { - struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page == prev->bv_page && - offset == prev->bv_offset + prev->bv_len) { - unsigned int prev_bv_len = prev->bv_len; - prev->bv_len += len; - - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - /* prev_bvec is already charged in - bi_size, discharge it in order to - simulate merging updated prev_bvec - as new bvec. */ - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size - prev_bv_len, - .bi_rw = bio->bi_rw, - }; - - if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { - prev->bv_len -= len; - return 0; - } - } - - goto done; - } - } - - if (bio->bi_vcnt >= bio->bi_max_vecs) - return 0; - - /* - * we might lose a segment or two here, but rather that than - * make this too complex. - */ - - while (bio->bi_phys_segments >= queue_max_segments(q)) { - - if (retried_segments) - return 0; - - retried_segments = 1; - blk_recount_segments(q, bio); - } - - /* - * setup the new entry, we might clear it again later if we - * cannot add the page - */ - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; - - /* - * if queue has other restrictions (eg varying max sector size - * depending on offset), it can specify a merge_bvec_fn in the - * queue to get further control - */ - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, - .bi_rw = bio->bi_rw, - }; - - /* - * merge_bvec_fn() returns number of bytes it can accept - * at this offset - */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { - bvec->bv_page = NULL; - bvec->bv_len = 0; - bvec->bv_offset = 0; - return 0; - } - } - - /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) - bio->bi_flags &= ~(1 << BIO_SEG_VALID); - - bio->bi_vcnt++; - bio->bi_phys_segments++; - done: - bio->bi_size += len; - return len; -} - -/** - * bio_add_pc_page - attempt to add page to bio - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - * - * This should only be used by REQ_PC bios. - */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - return __bio_add_page(q, bio, page, len, offset, - queue_max_hw_sectors(q)); -} -EXPORT_SYMBOL(bio_add_pc_page); - -/** - * bio_add_page - attempt to add page to bio - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - */ -int bio_add_page(struct bio *bio, struct page *page, unsigned int len, - unsigned int offset) -{ - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); -} -EXPORT_SYMBOL(bio_add_page); - -struct submit_bio_ret { - struct completion event; - int error; -}; - -static void submit_bio_wait_endio(struct bio *bio, int error) -{ - struct submit_bio_ret *ret = bio->bi_private; - - ret->error = error; - complete(&ret->event); -} - -/** - * submit_bio_wait - submit a bio, and wait until it completes - * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) - * @bio: The &struct bio which describes the I/O - * - * Simple wrapper around submit_bio(). Returns 0 on success, or the error from - * bio_endio() on failure. - */ -int submit_bio_wait(int rw, struct bio *bio) -{ - struct submit_bio_ret ret; - - rw |= REQ_SYNC; - init_completion(&ret.event); - bio->bi_private = &ret; - bio->bi_end_io = submit_bio_wait_endio; - submit_bio(rw, bio); - wait_for_completion(&ret.event); - - return ret.error; -} -EXPORT_SYMBOL(submit_bio_wait); - -/** - * bio_advance - increment/complete a bio by some number of bytes - * @bio: bio to advance - * @bytes: number of bytes to complete - * - * This updates bi_sector, bi_size and bi_idx; if the number of bytes to - * complete doesn't align with a bvec boundary, then bv_len and bv_offset will - * be updated on the last bvec as well. - * - * @bio will then represent the remaining, uncompleted portion of the io. - */ -void bio_advance(struct bio *bio, unsigned bytes) -{ - if (bio_integrity(bio)) - bio_integrity_advance(bio, bytes); - - bio->bi_sector += bytes >> 9; - bio->bi_size -= bytes; - - if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) - return; - - while (bytes) { - if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { - WARN_ONCE(1, "bio idx %d >= vcnt %d\n", - bio->bi_idx, bio->bi_vcnt); - break; - } - - if (bytes >= bio_iovec(bio)->bv_len) { - bytes -= bio_iovec(bio)->bv_len; - bio->bi_idx++; - } else { - bio_iovec(bio)->bv_len -= bytes; - bio_iovec(bio)->bv_offset += bytes; - bytes = 0; - } - } -} -EXPORT_SYMBOL(bio_advance); - -/** - * bio_alloc_pages - allocates a single page for each bvec in a bio - * @bio: bio to allocate pages for - * @gfp_mask: flags for allocation - * - * Allocates pages up to @bio->bi_vcnt. - * - * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are - * freed. - */ -int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) -{ - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, i) { - bv->bv_page = alloc_page(gfp_mask); - if (!bv->bv_page) { - while (--bv >= bio->bi_io_vec) - __free_page(bv->bv_page); - return -ENOMEM; - } - } - - return 0; -} -EXPORT_SYMBOL(bio_alloc_pages); - -/** - * bio_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats - * @src and @dst as linked lists of bios. - * - * Stops when it reaches the end of either @src or @dst - that is, copies - * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). - */ -void bio_copy_data(struct bio *dst, struct bio *src) -{ - struct bio_vec *src_bv, *dst_bv; - unsigned src_offset, dst_offset, bytes; - void *src_p, *dst_p; - - src_bv = bio_iovec(src); - dst_bv = bio_iovec(dst); - - src_offset = src_bv->bv_offset; - dst_offset = dst_bv->bv_offset; - - while (1) { - if (src_offset == src_bv->bv_offset + src_bv->bv_len) { - src_bv++; - if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) { - src = src->bi_next; - if (!src) - break; - - src_bv = bio_iovec(src); - } - - src_offset = src_bv->bv_offset; - } - - if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) { - dst_bv++; - if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) { - dst = dst->bi_next; - if (!dst) - break; - - dst_bv = bio_iovec(dst); - } - - dst_offset = dst_bv->bv_offset; - } - - bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset, - src_bv->bv_offset + src_bv->bv_len - src_offset); - - src_p = kmap_atomic(src_bv->bv_page); - dst_p = kmap_atomic(dst_bv->bv_page); - - memcpy(dst_p + dst_offset, - src_p + src_offset, - bytes); - - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - src_offset += bytes; - dst_offset += bytes; - } -} -EXPORT_SYMBOL(bio_copy_data); - -struct bio_map_data { - struct bio_vec *iovecs; - struct sg_iovec *sgvecs; - int nr_sgvecs; - int is_our_pages; -}; - -static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - struct sg_iovec *iov, int iov_count, - int is_our_pages) -{ - memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); - memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); - bmd->nr_sgvecs = iov_count; - bmd->is_our_pages = is_our_pages; - bio->bi_private = bmd; -} - -static void bio_free_map_data(struct bio_map_data *bmd) -{ - kfree(bmd->iovecs); - kfree(bmd->sgvecs); - kfree(bmd); -} - -static struct bio_map_data *bio_alloc_map_data(int nr_segs, - unsigned int iov_count, - gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - - if (iov_count > UIO_MAXIOV) - return NULL; - - bmd = kmalloc(sizeof(*bmd), gfp_mask); - if (!bmd) - return NULL; - - bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); - if (!bmd->iovecs) { - kfree(bmd); - return NULL; - } - - bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); - if (bmd->sgvecs) - return bmd; - - kfree(bmd->iovecs); - kfree(bmd); - return NULL; -} - -static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, - struct sg_iovec *iov, int iov_count, - int to_user, int from_user, int do_free_page) -{ - int ret = 0, i; - struct bio_vec *bvec; - int iov_idx = 0; - unsigned int iov_off = 0; - - bio_for_each_segment_all(bvec, bio, i) { - char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = iovecs[i].bv_len; - - while (bv_len && iov_idx < iov_count) { - unsigned int bytes; - char __user *iov_addr; - - bytes = min_t(unsigned int, - iov[iov_idx].iov_len - iov_off, bv_len); - iov_addr = iov[iov_idx].iov_base + iov_off; - - if (!ret) { - if (to_user) - ret = copy_to_user(iov_addr, bv_addr, - bytes); - - if (from_user) - ret = copy_from_user(bv_addr, iov_addr, - bytes); - - if (ret) - ret = -EFAULT; - } - - bv_len -= bytes; - bv_addr += bytes; - iov_addr += bytes; - iov_off += bytes; - - if (iov[iov_idx].iov_len == iov_off) { - iov_idx++; - iov_off = 0; - } - } - - if (do_free_page) - __free_page(bvec->bv_page); - } - - return ret; -} - -/** - * bio_uncopy_user - finish previously mapped bio - * @bio: bio being terminated - * - * Free pages allocated from bio_copy_user() and write back data - * to user space in case of a read. - */ -int bio_uncopy_user(struct bio *bio) -{ - struct bio_map_data *bmd = bio->bi_private; - struct bio_vec *bvec; - int ret = 0, i; - - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { - /* - * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free. - */ - if (current->mm) - ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, - bmd->nr_sgvecs, bio_data_dir(bio) == READ, - 0, bmd->is_our_pages); - else if (bmd->is_our_pages) - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - } - bio_free_map_data(bmd); - bio_put(bio); - return ret; -} -EXPORT_SYMBOL(bio_uncopy_user); - -/** - * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iov: the iovec. - * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user_iov(struct request_queue *q, - struct rq_map_data *map_data, - struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - struct bio_vec *bvec; - struct page *page; - struct bio *bio; - int i, ret; - int nr_pages = 0; - unsigned int len = 0; - unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; - - for (i = 0; i < iov_count; i++) { - unsigned long uaddr; - unsigned long end; - unsigned long start; - - uaddr = (unsigned long)iov[i].iov_base; - end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = uaddr >> PAGE_SHIFT; - - /* - * Overflow, abort - */ - if (end < start) - return ERR_PTR(-EINVAL); - - nr_pages += end - start; - len += iov[i].iov_len; - } - - if (offset) - nr_pages++; - - bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); - if (!bmd) - return ERR_PTR(-ENOMEM); - - ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - goto out_bmd; - - if (!write_to_vm) - bio->bi_rw |= REQ_WRITE; - - ret = 0; - - if (map_data) { - nr_pages = 1 << map_data->page_order; - i = map_data->offset / PAGE_SIZE; - } - while (len) { - unsigned int bytes = PAGE_SIZE; - - bytes -= offset; - - if (bytes > len) - bytes = len; - - if (map_data) { - if (i == map_data->nr_entries * nr_pages) { - ret = -ENOMEM; - break; - } - - page = map_data->pages[i / nr_pages]; - page += (i % nr_pages); - - i++; - } else { - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - break; - } - } - - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) - break; - - len -= bytes; - offset = 0; - } - - if (ret) - goto cleanup; - - /* - * success - */ - if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || - (map_data && map_data->from_user)) { - ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0); - if (ret) - goto cleanup; - } - - bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1); - return bio; -cleanup: - if (!map_data) - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - - bio_put(bio); -out_bmd: - bio_free_map_data(bmd); - return ERR_PTR(ret); -} - -/** - * bio_copy_user - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, - unsigned long uaddr, unsigned int len, - int write_to_vm, gfp_t gfp_mask) -{ - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_copy_user); - -static struct bio *__bio_map_user_iov(struct request_queue *q, - struct block_device *bdev, - struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) -{ - int i, j; - int nr_pages = 0; - struct page **pages; - struct bio *bio; - int cur_page = 0; - int ret, offset; - - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; - unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = uaddr >> PAGE_SHIFT; - - /* - * Overflow, abort - */ - if (end < start) - return ERR_PTR(-EINVAL); - - nr_pages += end - start; - /* - * buffer must be aligned to at least hardsector size for now - */ - if (uaddr & queue_dma_alignment(q)) - return ERR_PTR(-EINVAL); - } - - if (!nr_pages) - return ERR_PTR(-EINVAL); - - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - ret = -ENOMEM; - pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); - if (!pages) - goto out; - - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; - unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = uaddr >> PAGE_SHIFT; - const int local_nr_pages = end - start; - const int page_limit = cur_page + local_nr_pages; - - ret = get_user_pages_fast(uaddr, local_nr_pages, - write_to_vm, &pages[cur_page]); - if (ret < local_nr_pages) { - ret = -EFAULT; - goto out_unmap; - } - - offset = uaddr & ~PAGE_MASK; - for (j = cur_page; j < page_limit; j++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - /* - * sorry... - */ - if (bio_add_pc_page(q, bio, pages[j], bytes, offset) < - bytes) - break; - - len -= bytes; - offset = 0; - } - - cur_page = j; - /* - * release the pages we didn't map into the bio, if any - */ - while (j < page_limit) - page_cache_release(pages[j++]); - } - - kfree(pages); - - /* - * set data direction, and check if mapped pages need bouncing - */ - if (!write_to_vm) - bio->bi_rw |= REQ_WRITE; - - bio->bi_bdev = bdev; - bio->bi_flags |= (1 << BIO_USER_MAPPED); - return bio; - - out_unmap: - for (i = 0; i < nr_pages; i++) { - if(!pages[i]) - break; - page_cache_release(pages[i]); - } - out: - kfree(pages); - bio_put(bio); - return ERR_PTR(ret); -} - -/** - * bio_map_user - map user address into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, - unsigned long uaddr, unsigned int len, int write_to_vm, - gfp_t gfp_mask) -{ - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_map_user); - -/** - * bio_map_user_iov - map user sg_iovec table into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @iov: the iovec. - * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) -{ - struct bio *bio; - - bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, - gfp_mask); - if (IS_ERR(bio)) - return bio; - - /* - * subtle -- if __bio_map_user() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. - * however, we need it for the unmap, so grab an extra - * reference to it - */ - bio_get(bio); - - return bio; -} - -static void __bio_unmap_user(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - /* - * make sure we dirty pages we wrote to - */ - bio_for_each_segment_all(bvec, bio, i) { - if (bio_data_dir(bio) == READ) - set_page_dirty_lock(bvec->bv_page); - - page_cache_release(bvec->bv_page); - } - - bio_put(bio); -} - -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user(). Must be called with - * a process context. - * - * bio_unmap_user() may sleep. - */ -void bio_unmap_user(struct bio *bio) -{ - __bio_unmap_user(bio); - bio_put(bio); -} -EXPORT_SYMBOL(bio_unmap_user); - -static void bio_map_kern_endio(struct bio *bio, int err) -{ - bio_put(bio); -} - -static struct bio *__bio_map_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - int offset, i; - struct bio *bio; - - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, - offset) < bytes) - break; - - data += bytes; - len -= bytes; - offset = 0; - } - - bio->bi_end_io = bio_map_kern_endio; - return bio; -} - -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask) -{ - struct bio *bio; - - bio = __bio_map_kern(q, data, len, gfp_mask); - if (IS_ERR(bio)) - return bio; - - if (bio->bi_size == len) - return bio; - - /* - * Don't support partial mappings. - */ - bio_put(bio); - return ERR_PTR(-EINVAL); -} -EXPORT_SYMBOL(bio_map_kern); - -static void bio_copy_kern_endio(struct bio *bio, int err) -{ - struct bio_vec *bvec; - const int read = bio_data_dir(bio) == READ; - struct bio_map_data *bmd = bio->bi_private; - int i; - char *p = bmd->sgvecs[0].iov_base; - - bio_for_each_segment_all(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); - int len = bmd->iovecs[i].bv_len; - - if (read) - memcpy(p, addr, len); - - __free_page(bvec->bv_page); - p += len; - } - - bio_free_map_data(bmd); - bio_put(bio); -} - -/** - * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to copy - * @len: length in bytes - * @gfp_mask: allocation flags for bio and page allocation - * @reading: data direction is READ - * - * copy the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask, int reading) -{ - struct bio *bio; - struct bio_vec *bvec; - int i; - - bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); - if (IS_ERR(bio)) - return bio; - - if (!reading) { - void *p = data; - - bio_for_each_segment_all(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); - - memcpy(addr, p, bvec->bv_len); - p += bvec->bv_len; - } - } - - bio->bi_end_io = bio_copy_kern_endio; - - return bio; -} -EXPORT_SYMBOL(bio_copy_kern); - -/* - * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions - * for performing direct-IO in BIOs. - * - * The problem is that we cannot run set_page_dirty() from interrupt context - * because the required locks are not interrupt-safe. So what we can do is to - * mark the pages dirty _before_ performing IO. And in interrupt context, - * check that the pages are still dirty. If so, fine. If not, redirty them - * in process context. - * - * We special-case compound pages here: normally this means reads into hugetlb - * pages. The logic in here doesn't really work right for compound pages - * because the VM does not uniformly chase down the head page in all cases. - * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't - * handle them at all. So we skip compound pages here at an early stage. - * - * Note that this code is very hard to test under normal circumstances because - * direct-io pins the pages with get_user_pages(). This makes - * is_page_cache_freeable return false, and the VM will not clean the pages. - * But other code (eg, flusher threads) could clean the pages if they are mapped - * pagecache. - * - * Simply disabling the call to bio_set_pages_dirty() is a good way to test the - * deferred bio dirtying paths. - */ - -/* - * bio_set_pages_dirty() will mark all the bio's pages as dirty. - */ -void bio_set_pages_dirty(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page && !PageCompound(page)) - set_page_dirty_lock(page); - } -} - -static void bio_release_pages(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page) - put_page(page); - } -} - -/* - * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. - * If they are, then fine. If, however, some pages are clean then they must - * have been written out during the direct-IO read. So we take another ref on - * the BIO and the offending pages and re-dirty the pages in process context. - * - * It is expected that bio_check_pages_dirty() will wholly own the BIO from - * here on. It will run one page_cache_release() against each page and will - * run one bio_put() against the BIO. - */ - -static void bio_dirty_fn(struct work_struct *work); - -static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); -static DEFINE_SPINLOCK(bio_dirty_lock); -static struct bio *bio_dirty_list; - -/* - * This runs in process context - */ -static void bio_dirty_fn(struct work_struct *work) -{ - unsigned long flags; - struct bio *bio; - - spin_lock_irqsave(&bio_dirty_lock, flags); - bio = bio_dirty_list; - bio_dirty_list = NULL; - spin_unlock_irqrestore(&bio_dirty_lock, flags); - - while (bio) { - struct bio *next = bio->bi_private; - - bio_set_pages_dirty(bio); - bio_release_pages(bio); - bio_put(bio); - bio = next; - } -} - -void bio_check_pages_dirty(struct bio *bio) -{ - struct bio_vec *bvec; - int nr_clean_pages = 0; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (PageDirty(page) || PageCompound(page)) { - page_cache_release(page); - bvec->bv_page = NULL; - } else { - nr_clean_pages++; - } - } - - if (nr_clean_pages) { - unsigned long flags; - - spin_lock_irqsave(&bio_dirty_lock, flags); - bio->bi_private = bio_dirty_list; - bio_dirty_list = bio; - spin_unlock_irqrestore(&bio_dirty_lock, flags); - schedule_work(&bio_dirty_work); - } else { - bio_put(bio); - } -} - -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -void bio_flush_dcache_pages(struct bio *bi) -{ - int i; - struct bio_vec *bvec; - - bio_for_each_segment(bvec, bi, i) - flush_dcache_page(bvec->bv_page); -} -EXPORT_SYMBOL(bio_flush_dcache_pages); -#endif - -/** - * bio_endio - end I/O on a bio - * @bio: bio - * @error: error, if any - * - * Description: - * bio_endio() will end I/O on the whole bio. bio_endio() is the - * preferred way to end I/O on a bio, it takes care of clearing - * BIO_UPTODATE on error. @error is 0 on success, and and one of the - * established -Exxxx (-EIO, for instance) error values in case - * something went wrong. No one should call bi_end_io() directly on a - * bio unless they own it and thus know that it has an end_io - * function. - **/ -void bio_endio(struct bio *bio, int error) -{ - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - - if (bio->bi_end_io) - bio->bi_end_io(bio, error); -} -EXPORT_SYMBOL(bio_endio); - -void bio_pair_release(struct bio_pair *bp) -{ - if (atomic_dec_and_test(&bp->cnt)) { - struct bio *master = bp->bio1.bi_private; - - bio_endio(master, bp->error); - mempool_free(bp, bp->bio2.bi_private); - } -} -EXPORT_SYMBOL(bio_pair_release); - -static void bio_pair_end_1(struct bio *bi, int err) -{ - struct bio_pair *bp = container_of(bi, struct bio_pair, bio1); - - if (err) - bp->error = err; - - bio_pair_release(bp); -} - -static void bio_pair_end_2(struct bio *bi, int err) -{ - struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); - - if (err) - bp->error = err; - - bio_pair_release(bp); -} - -/* - * split a bio - only worry about a bio with a single page in its iovec - */ -struct bio_pair *bio_split(struct bio *bi, int first_sectors) -{ - struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); - - if (!bp) - return bp; - - trace_block_split(bdev_get_queue(bi->bi_bdev), bi, - bi->bi_sector + first_sectors); - - BUG_ON(bio_segments(bi) > 1); - atomic_set(&bp->cnt, 3); - bp->error = 0; - bp->bio1 = *bi; - bp->bio2 = *bi; - bp->bio2.bi_sector += first_sectors; - bp->bio2.bi_size -= first_sectors << 9; - bp->bio1.bi_size = first_sectors << 9; - - if (bi->bi_vcnt != 0) { - bp->bv1 = *bio_iovec(bi); - bp->bv2 = *bio_iovec(bi); - - if (bio_is_rw(bi)) { - bp->bv2.bv_offset += first_sectors << 9; - bp->bv2.bv_len -= first_sectors << 9; - bp->bv1.bv_len = first_sectors << 9; - } - - bp->bio1.bi_io_vec = &bp->bv1; - bp->bio2.bi_io_vec = &bp->bv2; - - bp->bio1.bi_max_vecs = 1; - bp->bio2.bi_max_vecs = 1; - } - - bp->bio1.bi_end_io = bio_pair_end_1; - bp->bio2.bi_end_io = bio_pair_end_2; - - bp->bio1.bi_private = bi; - bp->bio2.bi_private = bio_split_pool; - - if (bio_integrity(bi)) - bio_integrity_split(bi, bp, first_sectors); - - return bp; -} -EXPORT_SYMBOL(bio_split); - -/** - * bio_sector_offset - Find hardware sector offset in bio - * @bio: bio to inspect - * @index: bio_vec index - * @offset: offset in bv_page - * - * Return the number of hardware sectors between beginning of bio - * and an end point indicated by a bio_vec index and an offset - * within that vector's page. - */ -sector_t bio_sector_offset(struct bio *bio, unsigned short index, - unsigned int offset) -{ - unsigned int sector_sz; - struct bio_vec *bv; - sector_t sectors; - int i; - - sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); - sectors = 0; - - if (index >= bio->bi_idx) - index = bio->bi_vcnt - 1; - - bio_for_each_segment_all(bv, bio, i) { - if (i == index) { - if (offset > bv->bv_offset) - sectors += (offset - bv->bv_offset) / sector_sz; - break; - } - - sectors += bv->bv_len / sector_sz; - } - - return sectors; -} -EXPORT_SYMBOL(bio_sector_offset); - -/* - * create memory pools for biovec's in a bio_set. - * use the global biovec slabs created for general use. - */ -mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) -{ - struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; - - return mempool_create_slab_pool(pool_entries, bp->slab); -} - -void bioset_free(struct bio_set *bs) -{ - if (bs->rescue_workqueue) - destroy_workqueue(bs->rescue_workqueue); - - if (bs->bio_pool) - mempool_destroy(bs->bio_pool); - - if (bs->bvec_pool) - mempool_destroy(bs->bvec_pool); - - bioset_integrity_free(bs); - bio_put_slab(bs); - - kfree(bs); -} -EXPORT_SYMBOL(bioset_free); - -/** - * bioset_create - Create a bio_set - * @pool_size: Number of bio and bio_vecs to cache in the mempool - * @front_pad: Number of bytes to allocate in front of the returned bio - * - * Description: - * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller - * to ask for a number of bytes to be allocated in front of the bio. - * Front pad allocation is useful for embedding the bio inside - * another structure, to avoid allocating extra data to go with the bio. - * Note that the bio must be embedded at the END of that structure always, - * or things will break badly. - */ -struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) -{ - unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); - struct bio_set *bs; - - bs = kzalloc(sizeof(*bs), GFP_KERNEL); - if (!bs) - return NULL; - - bs->front_pad = front_pad; - - spin_lock_init(&bs->rescue_lock); - bio_list_init(&bs->rescue_list); - INIT_WORK(&bs->rescue_work, bio_alloc_rescue); - - bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); - if (!bs->bio_slab) { - kfree(bs); - return NULL; - } - - bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); - if (!bs->bio_pool) - goto bad; - - bs->bvec_pool = biovec_create_pool(bs, pool_size); - if (!bs->bvec_pool) - goto bad; - - bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); - if (!bs->rescue_workqueue) - goto bad; - - return bs; -bad: - bioset_free(bs); - return NULL; -} -EXPORT_SYMBOL(bioset_create); - -#ifdef CONFIG_BLK_CGROUP -/** - * bio_associate_current - associate a bio with %current - * @bio: target bio - * - * Associate @bio with %current if it hasn't been associated yet. Block - * layer will treat @bio as if it were issued by %current no matter which - * task actually issues it. - * - * This function takes an extra reference of @task's io_context and blkcg - * which will be put when @bio is released. The caller must own @bio, - * ensure %current->io_context exists, and is responsible for synchronizing - * calls to this function. - */ -int bio_associate_current(struct bio *bio) -{ - struct io_context *ioc; - struct cgroup_subsys_state *css; - - if (bio->bi_ioc) - return -EBUSY; - - ioc = current->io_context; - if (!ioc) - return -ENOENT; - - /* acquire active ref on @ioc and associate */ - get_io_context_active(ioc); - bio->bi_ioc = ioc; - - /* associate blkcg if exists */ - rcu_read_lock(); - css = task_css(current, blkio_subsys_id); - if (css && css_tryget(css)) - bio->bi_css = css; - rcu_read_unlock(); - - return 0; -} - -/** - * bio_disassociate_task - undo bio_associate_current() - * @bio: target bio - */ -void bio_disassociate_task(struct bio *bio) -{ - if (bio->bi_ioc) { - put_io_context(bio->bi_ioc); - bio->bi_ioc = NULL; - } - if (bio->bi_css) { - css_put(bio->bi_css); - bio->bi_css = NULL; - } -} - -#endif /* CONFIG_BLK_CGROUP */ - -static void __init biovec_init_slabs(void) -{ - int i; - - for (i = 0; i < BIOVEC_NR_POOLS; i++) { - int size; - struct biovec_slab *bvs = bvec_slabs + i; - - if (bvs->nr_vecs <= BIO_INLINE_VECS) { - bvs->slab = NULL; - continue; - } - - size = bvs->nr_vecs * sizeof(struct bio_vec); - bvs->slab = kmem_cache_create(bvs->name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - } -} - -static int __init init_bio(void) -{ - bio_slab_max = 2; - bio_slab_nr = 0; - bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); - if (!bio_slabs) - panic("bio: can't allocate bios\n"); - - bio_integrity_init(); - biovec_init_slabs(); - - fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); - if (!fs_bio_set) - panic("bio: can't allocate bios\n"); - - if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) - panic("bio: can't create integrity pool\n"); - - bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, - sizeof(struct bio_pair)); - if (!bio_split_pool) - panic("bio: can't create split pool\n"); - - return 0; -} -subsys_initcall(init_bio); diff --git a/fs/block_dev.c b/fs/block_dev.c index 1e86823a9cb..6d7274619bf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -83,7 +83,7 @@ void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0) + if (mapping->nrpages == 0 && mapping->nrshadows == 0) return; invalidate_bh_lrus(); @@ -165,14 +165,15 @@ blkdev_get_block(struct inode *inode, sector_t iblock, } static ssize_t -blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, - nr_segs, blkdev_get_block, NULL, NULL, 0); + return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter, + offset, blkdev_get_block, + NULL, NULL, 0); } int __sync_blockdev(struct block_device *bdev, int wait) @@ -363,6 +364,69 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(blkdev_fsync); +/** + * bdev_read_page() - Start reading a page from a block device + * @bdev: The device to read the page from + * @sector: The offset on the device to read the page to (need not be aligned) + * @page: The page to read + * + * On entry, the page should be locked. It will be unlocked when the page + * has been read. If the block driver implements rw_page synchronously, + * that will be true on exit from this function, but it need not be. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to read this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_read_page(struct block_device *bdev, sector_t sector, + struct page *page) +{ + const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page) + return -EOPNOTSUPP; + return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); +} +EXPORT_SYMBOL_GPL(bdev_read_page); + +/** + * bdev_write_page() - Start writing a page to a block device + * @bdev: The device to write the page to + * @sector: The offset on the device to write the page to (need not be aligned) + * @page: The page to write + * @wbc: The writeback_control for the write + * + * On entry, the page should be locked and not currently under writeback. + * On exit, if the write started successfully, the page will be unlocked and + * under writeback. If the write failed already (eg the driver failed to + * queue the page to the device), the page will still be locked. If the + * caller is a ->writepage implementation, it will need to unlock the page. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to write this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_write_page(struct block_device *bdev, sector_t sector, + struct page *page, struct writeback_control *wbc) +{ + int result; + int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; + const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page) + return -EOPNOTSUPP; + set_page_writeback(page); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); + if (result) + end_page_writeback(page); + else + unlock_page(page); + return result; +} +EXPORT_SYMBOL_GPL(bdev_write_page); + /* * pseudo-fs */ @@ -419,7 +483,7 @@ static void bdev_evict_inode(struct inode *inode) { struct block_device *bdev = &BDEV_I(inode)->bdev; struct list_head *p; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); spin_lock(&bdev_lock); @@ -1508,43 +1572,38 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) * Does not take i_mutex for the write and thus is not for general purpose * use. */ -ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct blk_plug plug; ssize_t ret; - BUG_ON(iocb->ki_pos != pos); - blk_start_plug(&plug); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_write_iter(iocb, from); if (ret > 0) { ssize_t err; - - err = generic_write_sync(file, pos, ret); - if (err < 0 && ret > 0) + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) ret = err; } blk_finish_plug(&plug); return ret; } -EXPORT_SYMBOL_GPL(blkdev_aio_write); +EXPORT_SYMBOL_GPL(blkdev_write_iter); -static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *bd_inode = file->f_mapping->host; loff_t size = i_size_read(bd_inode); + loff_t pos = iocb->ki_pos; if (pos >= size) return 0; size -= pos; - if (size < iocb->ki_nbytes) - nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); - return generic_file_aio_read(iocb, iov, nr_segs, pos); + iov_iter_truncate(to, size); + return generic_file_read_iter(iocb, to); } /* @@ -1576,10 +1635,10 @@ const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = blkdev_aio_read, - .aio_write = blkdev_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = blkdev_read_iter, + .write_iter = blkdev_write_iter, .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, @@ -1587,7 +1646,7 @@ const struct file_operations def_blk_fops = { .compat_ioctl = compat_blkdev_ioctl, #endif .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 398cbd517be..a66768ebc8d 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,6 +1,7 @@ config BTRFS_FS tristate "Btrfs filesystem support" - select LIBCRC32C + select CRYPTO + select CRYPTO_CRC32C select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS @@ -9,12 +10,17 @@ config BTRFS_FS select XOR_BLOCKS help - Btrfs is a new filesystem with extents, writable snapshotting, - support for multiple devices and many more features. + Btrfs is a general purpose copy-on-write filesystem with extents, + writable snapshotting, support for multiple devices and many more + features focused on fault tolerance, repair and easy administration. - Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET - FINALIZED. You should say N here unless you are interested in - testing Btrfs with non-critical data. + The filesystem disk format is no longer unstable, and it's not + expected to change unless there are strong reasons to do so. If there + is a format change, file systems with a unchanged format will + continue to be mountable and usable by newer kernels. + + For more information, please see the web pages at + http://btrfs.wiki.kernel.org. To compile this file system support as a module, choose M here. The module will be called btrfs. @@ -59,7 +65,8 @@ config BTRFS_FS_RUN_SANITY_TESTS help This will run some basic sanity tests on the free space cache code to make sure it is acting as it should. These are mostly - regression tests and are only really interesting to btrfs devlopers. + regression tests and are only really interesting to btrfs + developers. If unsure, say N. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index a91a6a355cc..6d1d0b93b1a 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,9 +9,11 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o + uuid-tree.o props.o hash.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o -btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o +btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ + tests/extent-buffer-tests.o tests/btrfs-tests.o \ + tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index e15d2b0d8d3..9a0124a9585 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -35,13 +35,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; - if (!IS_POSIXACL(inode)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -76,44 +69,16 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return acl; } -static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int ret = 0; - - if (!IS_POSIXACL(dentry->d_inode)) - return -EOPNOTSUPP; - - acl = btrfs_get_acl(dentry->d_inode, type); - - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return ret; -} - /* * Needs to be called with fs_mutex held */ -static int btrfs_set_acl(struct btrfs_trans_handle *trans, +static int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; const char *name; char *value = NULL; - if (acl) { - ret = posix_acl_valid(acl); - if (ret < 0) - return ret; - ret = 0; - } - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -158,35 +123,9 @@ out: return ret; } -static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - int ret; - struct posix_acl *acl = NULL; - - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (!IS_POSIXACL(dentry->d_inode)) - return -EOPNOTSUPP; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto out; - } - } - - ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); -out: - posix_acl_release(acl); - - return ret; + return __btrfs_set_acl(NULL, inode, acl, type); } /* @@ -197,83 +136,31 @@ out: int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; int ret = 0; /* this happens with subvols */ if (!dir) return 0; - if (!S_ISLNK(inode->i_mode)) { - if (IS_POSIXACL(dir)) { - acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } + ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (ret) + return ret; - if (!acl) - inode->i_mode &= ~current_umask(); + if (default_acl) { + ret = __btrfs_set_acl(trans, inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); } - if (IS_POSIXACL(dir) && acl) { - if (S_ISDIR(inode->i_mode)) { - ret = btrfs_set_acl(trans, inode, acl, - ACL_TYPE_DEFAULT); - if (ret) - goto failed; - } - ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (ret < 0) - return ret; - - if (ret > 0) { - /* we need an acl */ - ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); - } else { - cache_no_acl(inode); - } - } else { - cache_no_acl(inode); + if (acl) { + if (!ret) + ret = __btrfs_set_acl(trans, inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); } -failed: - posix_acl_release(acl); - - return ret; -} -int btrfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int ret = 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!IS_POSIXACL(inode)) - return 0; - - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR_OR_NULL(acl)) - return PTR_ERR(acl); - - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (ret) - return ret; - ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS); - posix_acl_release(acl); + if (!default_acl && !acl) + cache_no_acl(inode); return ret; } - -const struct xattr_handler btrfs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; - -const struct xattr_handler btrfs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 08cc08f037a..5a201d81049 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -21,707 +22,315 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> +#include <linux/workqueue.h> #include "async-thread.h" +#include "ctree.h" + +#define WORK_DONE_BIT 0 +#define WORK_ORDER_DONE_BIT 1 +#define WORK_HIGH_PRIO_BIT 2 + +#define NO_THRESHOLD (-1) +#define DFT_THRESHOLD (32) + +struct __btrfs_workqueue { + struct workqueue_struct *normal_wq; + /* List head pointing to ordered work list */ + struct list_head ordered_list; + + /* Spinlock for ordered_list */ + spinlock_t list_lock; + + /* Thresholding related variants */ + atomic_t pending; + int max_active; + int current_max; + int thresh; + unsigned int count; + spinlock_t thres_lock; +}; -#define WORK_QUEUED_BIT 0 -#define WORK_DONE_BIT 1 -#define WORK_ORDER_DONE_BIT 2 -#define WORK_HIGH_PRIO_BIT 3 - -/* - * container for the kthread task pointer and the list of pending work - * One of these is allocated per thread. - */ -struct btrfs_worker_thread { - /* pool we belong to */ - struct btrfs_workers *workers; - - /* list of struct btrfs_work that are waiting for service */ - struct list_head pending; - struct list_head prio_pending; - - /* list of worker threads from struct btrfs_workers */ - struct list_head worker_list; - - /* kthread */ - struct task_struct *task; +struct btrfs_workqueue { + struct __btrfs_workqueue *normal; + struct __btrfs_workqueue *high; +}; - /* number of things on the pending list */ - atomic_t num_pending; +static inline struct __btrfs_workqueue +*__btrfs_alloc_workqueue(const char *name, int flags, int max_active, + int thresh) +{ + struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); - /* reference counter for this struct */ - atomic_t refs; + if (unlikely(!ret)) + return NULL; - unsigned long sequence; + ret->max_active = max_active; + atomic_set(&ret->pending, 0); + if (thresh == 0) + thresh = DFT_THRESHOLD; + /* For low threshold, disabling threshold is a better choice */ + if (thresh < DFT_THRESHOLD) { + ret->current_max = max_active; + ret->thresh = NO_THRESHOLD; + } else { + ret->current_max = 1; + ret->thresh = thresh; + } - /* protects the pending list. */ - spinlock_t lock; + if (flags & WQ_HIGHPRI) + ret->normal_wq = alloc_workqueue("%s-%s-high", flags, + ret->max_active, + "btrfs", name); + else + ret->normal_wq = alloc_workqueue("%s-%s", flags, + ret->max_active, "btrfs", + name); + if (unlikely(!ret->normal_wq)) { + kfree(ret); + return NULL; + } - /* set to non-zero when this thread is already awake and kicking */ - int working; + INIT_LIST_HEAD(&ret->ordered_list); + spin_lock_init(&ret->list_lock); + spin_lock_init(&ret->thres_lock); + trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI); + return ret; +} - /* are we currently idle */ - int idle; -}; +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); -static int __btrfs_start_workers(struct btrfs_workers *workers); +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, + int flags, + int max_active, + int thresh) +{ + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); -/* - * btrfs_start_workers uses kthread_run, which can block waiting for memory - * for a very long time. It will actually throttle on page writeback, - * and so it may not make progress until after our btrfs worker threads - * process all of the pending work structs in their queue - * - * This means we can't use btrfs_start_workers from inside a btrfs worker - * thread that is used as part of cleaning dirty memory, which pretty much - * involves all of the worker threads. - * - * Instead we have a helper queue who never has more than one thread - * where we scheduler thread start operations. This worker_start struct - * is used to contain the work and hold a pointer to the queue that needs - * another worker. - */ -struct worker_start { - struct btrfs_work work; - struct btrfs_workers *queue; -}; + if (unlikely(!ret)) + return NULL; -static void start_new_worker_func(struct btrfs_work *work) -{ - struct worker_start *start; - start = container_of(work, struct worker_start, work); - __btrfs_start_workers(start->queue); - kfree(start); -} + ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, + max_active, thresh); + if (unlikely(!ret->normal)) { + kfree(ret); + return NULL; + } -/* - * helper function to move a thread onto the idle list after it - * has finished some requests. - */ -static void check_idle_worker(struct btrfs_worker_thread *worker) -{ - if (!worker->idle && atomic_read(&worker->num_pending) < - worker->workers->idle_thresh / 2) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 1; - - /* the list may be empty if the worker is just starting */ - if (!list_empty(&worker->worker_list) && - !worker->workers->stopping) { - list_move(&worker->worker_list, - &worker->workers->idle_list); + if (flags & WQ_HIGHPRI) { + ret->high = __btrfs_alloc_workqueue(name, flags, max_active, + thresh); + if (unlikely(!ret->high)) { + __btrfs_destroy_workqueue(ret->normal); + kfree(ret); + return NULL; } - spin_unlock_irqrestore(&worker->workers->lock, flags); } + return ret; } /* - * helper function to move a thread off the idle list after new - * pending work is added. + * Hook for threshold which will be called in btrfs_queue_work. + * This hook WILL be called in IRQ handler context, + * so workqueue_set_max_active MUST NOT be called in this hook */ -static void check_busy_worker(struct btrfs_worker_thread *worker) +static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) { - if (worker->idle && atomic_read(&worker->num_pending) >= - worker->workers->idle_thresh) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 0; - - if (!list_empty(&worker->worker_list) && - !worker->workers->stopping) { - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - } - spin_unlock_irqrestore(&worker->workers->lock, flags); - } + if (wq->thresh == NO_THRESHOLD) + return; + atomic_inc(&wq->pending); } -static void check_pending_worker_creates(struct btrfs_worker_thread *worker) +/* + * Hook for threshold which will be called before executing the work, + * This hook is called in kthread content. + * So workqueue_set_max_active is called here. + */ +static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) { - struct btrfs_workers *workers = worker->workers; - struct worker_start *start; - unsigned long flags; - - rmb(); - if (!workers->atomic_start_pending) - return; + int new_max_active; + long pending; + int need_change = 0; - start = kzalloc(sizeof(*start), GFP_NOFS); - if (!start) + if (wq->thresh == NO_THRESHOLD) return; - start->work.func = start_new_worker_func; - start->queue = workers; - - spin_lock_irqsave(&workers->lock, flags); - if (!workers->atomic_start_pending) - goto out; - - workers->atomic_start_pending = 0; - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) - goto out; - - workers->num_workers_starting += 1; - spin_unlock_irqrestore(&workers->lock, flags); - btrfs_queue_worker(workers->atomic_worker_start, &start->work); - return; + atomic_dec(&wq->pending); + spin_lock(&wq->thres_lock); + /* + * Use wq->count to limit the calling frequency of + * workqueue_set_max_active. + */ + wq->count++; + wq->count %= (wq->thresh / 4); + if (!wq->count) + goto out; + new_max_active = wq->current_max; + /* + * pending may be changed later, but it's OK since we really + * don't need it so accurate to calculate new_max_active. + */ + pending = atomic_read(&wq->pending); + if (pending > wq->thresh) + new_max_active++; + if (pending < wq->thresh / 2) + new_max_active--; + new_max_active = clamp_val(new_max_active, 1, wq->max_active); + if (new_max_active != wq->current_max) { + need_change = 1; + wq->current_max = new_max_active; + } out: - kfree(start); - spin_unlock_irqrestore(&workers->lock, flags); + spin_unlock(&wq->thres_lock); + + if (need_change) { + workqueue_set_max_active(wq->normal_wq, wq->current_max); + } } -static noinline void run_ordered_completions(struct btrfs_workers *workers, - struct btrfs_work *work) +static void run_ordered_work(struct __btrfs_workqueue *wq) { - if (!workers->ordered) - return; - - set_bit(WORK_DONE_BIT, &work->flags); - - spin_lock(&workers->order_lock); + struct list_head *list = &wq->ordered_list; + struct btrfs_work *work; + spinlock_t *lock = &wq->list_lock; + unsigned long flags; while (1) { - if (!list_empty(&workers->prio_order_list)) { - work = list_entry(workers->prio_order_list.next, - struct btrfs_work, order_list); - } else if (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - } else { + spin_lock_irqsave(lock, flags); + if (list_empty(list)) break; - } + work = list_entry(list->next, struct btrfs_work, + ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; - /* we are going to call the ordered done function, but + /* + * we are going to call the ordered done function, but * we leave the work item on the list as a barrier so * that later work items that are done don't have their * functions called before this one returns */ if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) break; - - spin_unlock(&workers->order_lock); - + trace_btrfs_ordered_sched(work); + spin_unlock_irqrestore(lock, flags); work->ordered_func(work); /* now take the lock again and drop our item from the list */ - spin_lock(&workers->order_lock); - list_del(&work->order_list); - spin_unlock(&workers->order_lock); + spin_lock_irqsave(lock, flags); + list_del(&work->ordered_list); + spin_unlock_irqrestore(lock, flags); /* * we don't want to call the ordered free functions * with the lock held though */ work->ordered_free(work); - spin_lock(&workers->order_lock); - } - - spin_unlock(&workers->order_lock); -} - -static void put_worker(struct btrfs_worker_thread *worker) -{ - if (atomic_dec_and_test(&worker->refs)) - kfree(worker); -} - -static int try_worker_shutdown(struct btrfs_worker_thread *worker) -{ - int freeit = 0; - - spin_lock_irq(&worker->lock); - spin_lock(&worker->workers->lock); - if (worker->workers->num_workers > 1 && - worker->idle && - !worker->working && - !list_empty(&worker->worker_list) && - list_empty(&worker->prio_pending) && - list_empty(&worker->pending) && - atomic_read(&worker->num_pending) == 0) { - freeit = 1; - list_del_init(&worker->worker_list); - worker->workers->num_workers--; + trace_btrfs_all_work_done(work); } - spin_unlock(&worker->workers->lock); - spin_unlock_irq(&worker->lock); - - if (freeit) - put_worker(worker); - return freeit; + spin_unlock_irqrestore(lock, flags); } -static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, - struct list_head *prio_head, - struct list_head *head) +static void normal_work_helper(struct work_struct *arg) { - struct btrfs_work *work = NULL; - struct list_head *cur = NULL; - - if(!list_empty(prio_head)) - cur = prio_head->next; - - smp_mb(); - if (!list_empty(&worker->prio_pending)) - goto refill; - - if (!list_empty(head)) - cur = head->next; - - if (cur) - goto out; - -refill: - spin_lock_irq(&worker->lock); - list_splice_tail_init(&worker->prio_pending, prio_head); - list_splice_tail_init(&worker->pending, head); - - if (!list_empty(prio_head)) - cur = prio_head->next; - else if (!list_empty(head)) - cur = head->next; - spin_unlock_irq(&worker->lock); - - if (!cur) - goto out_fail; - -out: - work = list_entry(cur, struct btrfs_work, list); - -out_fail: - return work; -} - -/* - * main loop for servicing work items - */ -static int worker_loop(void *arg) -{ - struct btrfs_worker_thread *worker = arg; - struct list_head head; - struct list_head prio_head; struct btrfs_work *work; + struct __btrfs_workqueue *wq; + int need_order = 0; - INIT_LIST_HEAD(&head); - INIT_LIST_HEAD(&prio_head); - - do { -again: - while (1) { - - - work = get_next_work(worker, &prio_head, &head); - if (!work) - break; - - list_del(&work->list); - clear_bit(WORK_QUEUED_BIT, &work->flags); - - work->worker = worker; - - work->func(work); - - atomic_dec(&worker->num_pending); - /* - * unless this is an ordered work queue, - * 'work' was probably freed by func above. - */ - run_ordered_completions(worker->workers, work); - - check_pending_worker_creates(worker); - cond_resched(); - } - - spin_lock_irq(&worker->lock); - check_idle_worker(worker); - - if (freezing(current)) { - worker->working = 0; - spin_unlock_irq(&worker->lock); - try_to_freeze(); - } else { - spin_unlock_irq(&worker->lock); - if (!kthread_should_stop()) { - cpu_relax(); - /* - * we've dropped the lock, did someone else - * jump_in? - */ - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - /* - * this short schedule allows more work to - * come in without the queue functions - * needing to go through wake_up_process() - * - * worker->working is still 1, so nobody - * is going to try and wake us up - */ - schedule_timeout(1); - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - if (kthread_should_stop()) - break; - - /* still no more work?, sleep for real */ - spin_lock_irq(&worker->lock); - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) { - spin_unlock_irq(&worker->lock); - set_current_state(TASK_RUNNING); - goto again; - } - - /* - * this makes sure we get a wakeup when someone - * adds something new to the queue - */ - worker->working = 0; - spin_unlock_irq(&worker->lock); - - if (!kthread_should_stop()) { - schedule_timeout(HZ * 120); - if (!worker->working && - try_worker_shutdown(worker)) { - return 0; - } - } - } - __set_current_state(TASK_RUNNING); - } - } while (!kthread_should_stop()); - return 0; -} - -/* - * this will wait for all the worker threads to shutdown - */ -void btrfs_stop_workers(struct btrfs_workers *workers) -{ - struct list_head *cur; - struct btrfs_worker_thread *worker; - int can_stop; - - spin_lock_irq(&workers->lock); - workers->stopping = 1; - list_splice_init(&workers->idle_list, &workers->worker_list); - while (!list_empty(&workers->worker_list)) { - cur = workers->worker_list.next; - worker = list_entry(cur, struct btrfs_worker_thread, - worker_list); - - atomic_inc(&worker->refs); - workers->num_workers -= 1; - if (!list_empty(&worker->worker_list)) { - list_del_init(&worker->worker_list); - put_worker(worker); - can_stop = 1; - } else - can_stop = 0; - spin_unlock_irq(&workers->lock); - if (can_stop) - kthread_stop(worker->task); - spin_lock_irq(&workers->lock); - put_worker(worker); + work = container_of(arg, struct btrfs_work, normal_work); + /* + * We should not touch things inside work in the following cases: + * 1) after work->func() if it has no ordered_free + * Since the struct is freed in work->func(). + * 2) after setting WORK_DONE_BIT + * The work may be freed in other threads almost instantly. + * So we save the needed things here. + */ + if (work->ordered_func) + need_order = 1; + wq = work->wq; + + trace_btrfs_work_sched(work); + thresh_exec_hook(wq); + work->func(work); + if (need_order) { + set_bit(WORK_DONE_BIT, &work->flags); + run_ordered_work(wq); } - spin_unlock_irq(&workers->lock); + if (!need_order) + trace_btrfs_all_work_done(work); } -/* - * simple init on struct btrfs_workers - */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_helper) +void btrfs_init_work(struct btrfs_work *work, + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free) { - workers->num_workers = 0; - workers->num_workers_starting = 0; - INIT_LIST_HEAD(&workers->worker_list); - INIT_LIST_HEAD(&workers->idle_list); - INIT_LIST_HEAD(&workers->order_list); - INIT_LIST_HEAD(&workers->prio_order_list); - spin_lock_init(&workers->lock); - spin_lock_init(&workers->order_lock); - workers->max_workers = max; - workers->idle_thresh = 32; - workers->name = name; - workers->ordered = 0; - workers->atomic_start_pending = 0; - workers->atomic_worker_start = async_helper; - workers->stopping = 0; + work->func = func; + work->ordered_func = ordered_func; + work->ordered_free = ordered_free; + INIT_WORK(&work->normal_work, normal_work_helper); + INIT_LIST_HEAD(&work->ordered_list); + work->flags = 0; } -/* - * starts new worker threads. This does not enforce the max worker - * count in case you need to temporarily go past it. - */ -static int __btrfs_start_workers(struct btrfs_workers *workers) +static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, + struct btrfs_work *work) { - struct btrfs_worker_thread *worker; - int ret = 0; - - worker = kzalloc(sizeof(*worker), GFP_NOFS); - if (!worker) { - ret = -ENOMEM; - goto fail; - } - - INIT_LIST_HEAD(&worker->pending); - INIT_LIST_HEAD(&worker->prio_pending); - INIT_LIST_HEAD(&worker->worker_list); - spin_lock_init(&worker->lock); - - atomic_set(&worker->num_pending, 0); - atomic_set(&worker->refs, 1); - worker->workers = workers; - worker->task = kthread_create(worker_loop, worker, - "btrfs-%s-%d", workers->name, - workers->num_workers + 1); - if (IS_ERR(worker->task)) { - ret = PTR_ERR(worker->task); - goto fail; - } + unsigned long flags; - spin_lock_irq(&workers->lock); - if (workers->stopping) { - spin_unlock_irq(&workers->lock); - goto fail_kthread; + work->wq = wq; + thresh_queue_hook(wq); + if (work->ordered_func) { + spin_lock_irqsave(&wq->list_lock, flags); + list_add_tail(&work->ordered_list, &wq->ordered_list); + spin_unlock_irqrestore(&wq->list_lock, flags); } - list_add_tail(&worker->worker_list, &workers->idle_list); - worker->idle = 1; - workers->num_workers++; - workers->num_workers_starting--; - WARN_ON(workers->num_workers_starting < 0); - spin_unlock_irq(&workers->lock); - - wake_up_process(worker->task); - return 0; - -fail_kthread: - kthread_stop(worker->task); -fail: - kfree(worker); - spin_lock_irq(&workers->lock); - workers->num_workers_starting--; - spin_unlock_irq(&workers->lock); - return ret; + queue_work(wq->normal_wq, &work->normal_work); + trace_btrfs_work_queued(work); } -int btrfs_start_workers(struct btrfs_workers *workers) -{ - spin_lock_irq(&workers->lock); - workers->num_workers_starting++; - spin_unlock_irq(&workers->lock); - return __btrfs_start_workers(workers); -} - -/* - * run through the list and find a worker thread that doesn't have a lot - * to do right now. This can return null if we aren't yet at the thread - * count limit and all of the threads are busy. - */ -static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work) { - struct btrfs_worker_thread *worker; - struct list_head *next; - int enforce_min; + struct __btrfs_workqueue *dest_wq; - enforce_min = (workers->num_workers + workers->num_workers_starting) < - workers->max_workers; - - /* - * if we find an idle thread, don't move it to the end of the - * idle list. This improves the chance that the next submission - * will reuse the same thread, and maybe catch it while it is still - * working - */ - if (!list_empty(&workers->idle_list)) { - next = workers->idle_list.next; - worker = list_entry(next, struct btrfs_worker_thread, - worker_list); - return worker; - } - if (enforce_min || list_empty(&workers->worker_list)) - return NULL; - - /* - * if we pick a busy task, move the task to the end of the list. - * hopefully this will keep things somewhat evenly balanced. - * Do the move in batches based on the sequence number. This groups - * requests submitted at roughly the same time onto the same worker. - */ - next = workers->worker_list.next; - worker = list_entry(next, struct btrfs_worker_thread, worker_list); - worker->sequence++; - - if (worker->sequence % workers->idle_thresh == 0) - list_move_tail(next, &workers->worker_list); - return worker; + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) + dest_wq = wq->high; + else + dest_wq = wq->normal; + __btrfs_queue_work(dest_wq, work); } -/* - * selects a worker thread to take the next job. This will either find - * an idle worker, start a new worker up to the max count, or just return - * one of the existing busy workers. - */ -static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) { - struct btrfs_worker_thread *worker; - unsigned long flags; - struct list_head *fallback; - int ret; - - spin_lock_irqsave(&workers->lock, flags); -again: - worker = next_worker(workers); - - if (!worker) { - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) { - goto fallback; - } else if (workers->atomic_worker_start) { - workers->atomic_start_pending = 1; - goto fallback; - } else { - workers->num_workers_starting++; - spin_unlock_irqrestore(&workers->lock, flags); - /* we're below the limit, start another worker */ - ret = __btrfs_start_workers(workers); - spin_lock_irqsave(&workers->lock, flags); - if (ret) - goto fallback; - goto again; - } - } - goto found; - -fallback: - fallback = NULL; - /* - * we have failed to find any workers, just - * return the first one we can find. - */ - if (!list_empty(&workers->worker_list)) - fallback = workers->worker_list.next; - if (!list_empty(&workers->idle_list)) - fallback = workers->idle_list.next; - BUG_ON(!fallback); - worker = list_entry(fallback, - struct btrfs_worker_thread, worker_list); -found: - /* - * this makes sure the worker doesn't exit before it is placed - * onto a busy/idle list - */ - atomic_inc(&worker->num_pending); - spin_unlock_irqrestore(&workers->lock, flags); - return worker; + destroy_workqueue(wq->normal_wq); + trace_btrfs_workqueue_destroy(wq); + kfree(wq); } -/* - * btrfs_requeue_work just puts the work item back on the tail of the list - * it was taken from. It is intended for use with long running work functions - * that make some progress and want to give the cpu up for others. - */ -void btrfs_requeue_work(struct btrfs_work *work) +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) { - struct btrfs_worker_thread *worker = work->worker; - unsigned long flags; - int wake = 0; - - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + if (!wq) return; - - spin_lock_irqsave(&worker->lock, flags); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - atomic_inc(&worker->num_pending); - - /* by definition we're busy, take ourselves off the idle - * list - */ - if (worker->idle) { - spin_lock(&worker->workers->lock); - worker->idle = 0; - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - spin_unlock(&worker->workers->lock); - } - if (!worker->working) { - wake = 1; - worker->working = 1; - } - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); + if (wq->high) + __btrfs_destroy_workqueue(wq->high); + __btrfs_destroy_workqueue(wq->normal); + kfree(wq); } -void btrfs_set_work_high_prio(struct btrfs_work *work) +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max) { - set_bit(WORK_HIGH_PRIO_BIT, &work->flags); + if (!wq) + return; + wq->normal->max_active = max; + if (wq->high) + wq->high->max_active = max; } -/* - * places a struct btrfs_work into the pending queue of one of the kthreads - */ -void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +void btrfs_set_work_high_priority(struct btrfs_work *work) { - struct btrfs_worker_thread *worker; - unsigned long flags; - int wake = 0; - - /* don't requeue something already on a list */ - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - return; - - worker = find_worker(workers); - if (workers->ordered) { - /* - * you're not allowed to do ordered queues from an - * interrupt handler - */ - spin_lock(&workers->order_lock); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { - list_add_tail(&work->order_list, - &workers->prio_order_list); - } else { - list_add_tail(&work->order_list, &workers->order_list); - } - spin_unlock(&workers->order_lock); - } else { - INIT_LIST_HEAD(&work->order_list); - } - - spin_lock_irqsave(&worker->lock, flags); - - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - check_busy_worker(worker); - - /* - * avoid calling into wake_up_process if this thread has already - * been kicked - */ - if (!worker->working) - wake = 1; - worker->working = 1; - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 1f26792683e..9c6b66d15fb 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -19,103 +20,35 @@ #ifndef __BTRFS_ASYNC_THREAD_ #define __BTRFS_ASYNC_THREAD_ -struct btrfs_worker_thread; +struct btrfs_workqueue; +/* Internal use only */ +struct __btrfs_workqueue; +struct btrfs_work; +typedef void (*btrfs_func_t)(struct btrfs_work *arg); -/* - * This is similar to a workqueue, but it is meant to spread the operations - * across all available cpus instead of just the CPU that was used to - * queue the work. There is also some batching introduced to try and - * cut down on context switches. - * - * By default threads are added on demand up to 2 * the number of cpus. - * Changing struct btrfs_workers->max_workers is one way to prevent - * demand creation of kthreads. - * - * the basic model of these worker threads is to embed a btrfs_work - * structure in your own data struct, and use container_of in a - * work function to get back to your data struct. - */ struct btrfs_work { - /* - * func should be set to the function you want called - * your work struct is passed as the only arg - * - * ordered_func must be set for work sent to an ordered work queue, - * and it is called to complete a given work item in the same - * order they were sent to the queue. - */ - void (*func)(struct btrfs_work *work); - void (*ordered_func)(struct btrfs_work *work); - void (*ordered_free)(struct btrfs_work *work); - - /* - * flags should be set to zero. It is used to make sure the - * struct is only inserted once into the list. - */ + btrfs_func_t func; + btrfs_func_t ordered_func; + btrfs_func_t ordered_free; + + /* Don't touch things below */ + struct work_struct normal_work; + struct list_head ordered_list; + struct __btrfs_workqueue *wq; unsigned long flags; - - /* don't touch these */ - struct btrfs_worker_thread *worker; - struct list_head list; - struct list_head order_list; -}; - -struct btrfs_workers { - /* current number of running workers */ - int num_workers; - - int num_workers_starting; - - /* max number of workers allowed. changed by btrfs_start_workers */ - int max_workers; - - /* once a worker has this many requests or fewer, it is idle */ - int idle_thresh; - - /* force completions in the order they were queued */ - int ordered; - - /* more workers required, but in an interrupt handler */ - int atomic_start_pending; - - /* - * are we allowed to sleep while starting workers or are we required - * to start them at a later time? If we can't sleep, this indicates - * which queue we need to use to schedule thread creation. - */ - struct btrfs_workers *atomic_worker_start; - - /* list with all the work threads. The workers on the idle thread - * may be actively servicing jobs, but they haven't yet hit the - * idle thresh limit above. - */ - struct list_head worker_list; - struct list_head idle_list; - - /* - * when operating in ordered mode, this maintains the list - * of work items waiting for completion - */ - struct list_head order_list; - struct list_head prio_order_list; - - /* lock for finding the next worker thread to queue on */ - spinlock_t lock; - - /* lock for the ordered lists */ - spinlock_t order_lock; - - /* extra name for this worker, used for current->name */ - char *name; - - int stopping; }; -void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); -int btrfs_start_workers(struct btrfs_workers *workers); -void btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_starter); -void btrfs_requeue_work(struct btrfs_work *work); -void btrfs_set_work_high_prio(struct btrfs_work *work); +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, + int flags, + int max_active, + int thresh); +void btrfs_init_work(struct btrfs_work *work, + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free); +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work); +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); +void btrfs_set_work_high_priority(struct btrfs_work *work); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 0552a599b28..e25564bfcb4 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -66,6 +66,16 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, return 0; } +static void free_inode_elem_list(struct extent_inode_elem *eie) +{ + struct extent_inode_elem *eie_next; + + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } +} + static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, struct extent_inode_elem **eie) @@ -185,6 +195,9 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, { struct __prelim_ref *ref; + if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID) + return 0; + ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask); if (!ref) return -ENOMEM; @@ -206,18 +219,20 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, } static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, int level, - struct btrfs_key *key_for_search, u64 time_seq, - u64 wanted_disk_byte, - const u64 *extent_item_pos) + struct ulist *parents, struct __prelim_ref *ref, + int level, u64 time_seq, const u64 *extent_item_pos, + u64 total_refs) { int ret = 0; int slot; struct extent_buffer *eb; struct btrfs_key key; + struct btrfs_key *key_for_search = &ref->key_for_search; struct btrfs_file_extent_item *fi; struct extent_inode_elem *eie = NULL, *old = NULL; u64 disk_byte; + u64 wanted_disk_byte = ref->wanted_disk_byte; + u64 count = 0; if (level != 0) { eb = path->nodes[level]; @@ -235,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) ret = btrfs_next_old_leaf(root, path, time_seq); - while (!ret) { + while (!ret && count < total_refs) { eb = path->nodes[0]; slot = path->slots[0]; @@ -251,6 +266,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (disk_byte == wanted_disk_byte) { eie = NULL; old = NULL; + count++; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -270,6 +286,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, old = old->next; old->next = eie; } + eie = NULL; } next: ret = btrfs_next_old_item(root, path, time_seq); @@ -277,6 +294,8 @@ next: if (ret > 0) ret = 0; + else if (ret < 0) + free_inode_elem_list(eie); return ret; } @@ -288,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct __prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos) + const u64 *extent_item_pos, u64 total_refs) { struct btrfs_root *root; struct btrfs_key root_key; @@ -296,23 +315,37 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int ret = 0; int root_level; int level = ref->level; + int index; root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + root = btrfs_read_fs_root_no_name(fs_info, &root_key); if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(root); goto out; } - root_level = btrfs_old_root_level(root, time_seq); + if (path->search_commit_root) + root_level = btrfs_header_level(root->commit_root); + else + root_level = btrfs_old_root_level(root, time_seq); - if (root_level + 1 == level) + if (root_level + 1 == level) { + srcu_read_unlock(&fs_info->subvol_srcu, index); goto out; + } path->lowest_level = level; ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + + /* root node has been locked, we can release @subvol_srcu safely here */ + srcu_read_unlock(&fs_info->subvol_srcu, index); + pr_debug("search slot in root %llu (level %d, ref count %d) returned " "%d for key (%llu %u %llu)\n", ref->root_id, level, ref->count, ret, @@ -323,8 +356,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; while (!eb) { - if (!level) { - WARN_ON(1); + if (WARN_ON(!level)) { ret = 1; goto out; } @@ -332,9 +364,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, level, &ref->key_for_search, - time_seq, ref->wanted_disk_byte, - extent_item_pos); + ret = add_all_parents(root, path, parents, ref, level, time_seq, + extent_item_pos, total_refs); out: path->lowest_level = 0; btrfs_release_path(path); @@ -347,7 +378,7 @@ out: static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct list_head *head, - const u64 *extent_item_pos) + const u64 *extent_item_pos, u64 total_refs) { int err; int ret = 0; @@ -373,11 +404,18 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, if (ref->count == 0) continue; err = __resolve_indirect_ref(fs_info, path, time_seq, ref, - parents, extent_item_pos); - if (err == -ENOMEM) - goto out; - if (err) + parents, extent_item_pos, + total_refs); + /* + * we can only tolerate ENOENT,otherwise,we should catch error + * and return directly. + */ + if (err == -ENOENT) { continue; + } else if (err) { + ret = err; + goto out; + } /* we put the first parent into the ref at hand */ ULIST_ITER_INIT(&uiter); @@ -524,7 +562,7 @@ static void __merge_refs(struct list_head *head, int mode) * smaller or equal that seq to the list */ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct list_head *prefs) + struct list_head *prefs, u64 *total_refs) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct rb_node *n = &head->node.rb_node; @@ -536,14 +574,13 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, if (extent_op && extent_op->update_key) btrfs_disk_key_to_cpu(&op_key, &extent_op->key); - while ((n = rb_prev(n))) { + spin_lock(&head->lock); + n = rb_first(&head->ref_root); + while (n) { struct btrfs_delayed_ref_node *node; node = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - if (node->bytenr != head->node.bytenr) - break; - WARN_ON(node->is_head); - + n = rb_next(n); if (node->seq > seq) continue; @@ -561,6 +598,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, default: BUG_ON(1); } + *total_refs += (node->ref_mod * sgn); switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { struct btrfs_delayed_tree_ref *ref; @@ -610,10 +648,10 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, WARN_ON(1); } if (ret) - return ret; + break; } - - return 0; + spin_unlock(&head->lock); + return ret; } /* @@ -621,7 +659,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, */ static int __add_inline_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - int *info_level, struct list_head *prefs) + int *info_level, struct list_head *prefs, + u64 *total_refs) { int ret = 0; int slot; @@ -645,6 +684,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); + *total_refs += btrfs_extent_refs(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); @@ -826,6 +866,8 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct list_head prefs_delayed; struct list_head prefs; struct __prelim_ref *ref; + struct extent_inode_elem *eie = NULL; + u64 total_refs = 0; INIT_LIST_HEAD(&prefs); INIT_LIST_HEAD(&prefs_delayed); @@ -840,8 +882,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!trans) + if (!trans) { path->search_commit_root = 1; + path->skip_locking = 1; + } /* * grab both a lock on the path and a lock on the delayed ref head. @@ -856,7 +900,11 @@ again: goto out; BUG_ON(ret == 0); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (trans && likely(trans->type != __TRANS_DUMMY)) { +#else if (trans) { +#endif /* * look if there are updates for this ref queued and lock the * head @@ -880,15 +928,15 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } + spin_unlock(&delayed_refs->lock); ret = __add_delayed_refs(head, time_seq, - &prefs_delayed); + &prefs_delayed, &total_refs); mutex_unlock(&head->mutex); - if (ret) { - spin_unlock(&delayed_refs->lock); + if (ret) goto out; - } + } else { + spin_unlock(&delayed_refs->lock); } - spin_unlock(&delayed_refs->lock); } if (path->slots[0]) { @@ -903,7 +951,8 @@ again: (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { ret = __add_inline_refs(fs_info, path, bytenr, - &info_level, &prefs); + &info_level, &prefs, + &total_refs); if (ret) goto out; ret = __add_keyed_refs(fs_info, path, bytenr, @@ -923,7 +972,7 @@ again: __merge_refs(&prefs, 1); ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, - extent_item_pos); + extent_item_pos, total_refs); if (ret) goto out; @@ -932,19 +981,19 @@ again: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); WARN_ON(ref->count < 0); - if (ref->count && ref->root_id && ref->parent == 0) { + if (roots && ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); if (ret < 0) goto out; } if (ref->count && ref->parent) { - struct extent_inode_elem *eie = NULL; - if (extent_item_pos && !ref->inode_list) { + if (extent_item_pos && !ref->inode_list && + ref->level == 0) { u32 bsz; struct extent_buffer *eb; bsz = btrfs_level_size(fs_info->extent_root, - info_level); + ref->level); eb = read_tree_block(fs_info->extent_root, ref->parent, bsz, 0); if (!eb || !extent_buffer_uptodate(eb)) { @@ -974,6 +1023,7 @@ again: eie = eie->next; eie->next = ref->inode_list; } + eie = NULL; } list_del(&ref->list); kmem_cache_free(btrfs_prelim_ref_cache, ref); @@ -992,7 +1042,8 @@ out: list_del(&ref->list); kmem_cache_free(btrfs_prelim_ref_cache, ref); } - + if (ret < 0) + free_inode_elem_list(eie); return ret; } @@ -1000,7 +1051,6 @@ static void free_leaf_list(struct ulist *blocks) { struct ulist_node *node = NULL; struct extent_inode_elem *eie; - struct extent_inode_elem *eie_next; struct ulist_iterator uiter; ULIST_ITER_INIT(&uiter); @@ -1008,10 +1058,7 @@ static void free_leaf_list(struct ulist *blocks) if (!node->aux) continue; eie = (struct extent_inode_elem *)(uintptr_t)node->aux; - for (; eie; eie = eie_next) { - eie_next = eie->next; - kfree(eie); - } + free_inode_elem_list(eie); node->aux = 0; } @@ -1031,22 +1078,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, u64 time_seq, struct ulist **leafs, const u64 *extent_item_pos) { - struct ulist *tmp; int ret; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) { - ulist_free(tmp); + if (!*leafs) return -ENOMEM; - } ret = find_parent_nodes(trans, fs_info, bytenr, - time_seq, *leafs, tmp, extent_item_pos); - ulist_free(tmp); - + time_seq, *leafs, NULL, extent_item_pos); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1068,9 +1107,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, * * returns 0 on success, < 0 on error. */ -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) +static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1099,42 +1138,25 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, if (!node) break; bytenr = node->val; + cond_resched(); } ulist_free(tmp); return 0; } - -static int __inode_info(u64 inum, u64 ioff, u8 key_type, - struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_key *found_key) +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) { int ret; - struct btrfs_key key; - struct extent_buffer *eb; - - key.type = key_type; - key.objectid = inum; - key.offset = ioff; - ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if (ret < 0) - return ret; - - eb = path->nodes[0]; - if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(fs_root, path); - if (ret) - return ret; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); - if (found_key->type != key.type || found_key->objectid != key.objectid) - return 1; - - return 0; + if (!trans) + down_read(&fs_info->commit_root_sem); + ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots); + if (!trans) + up_read(&fs_info->commit_root_sem); + return ret; } /* @@ -1144,16 +1166,16 @@ int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path) { struct btrfs_key key; - return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, - &key); + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_ITEM_KEY, &key); } static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_key *found_key) { - return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, - found_key); + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_REF_KEY, found_key); } int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, @@ -1333,20 +1355,20 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) return ret; - ret = btrfs_previous_item(fs_info->extent_root, path, - 0, BTRFS_EXTENT_ITEM_KEY); - if (ret < 0) - return ret; + ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0); + if (ret) { + if (ret > 0) + ret = -ENOENT; + return ret; + } btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type == BTRFS_METADATA_ITEM_KEY) size = fs_info->extent_root->leafsize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) size = found_key->offset; - if ((found_key->type != BTRFS_EXTENT_ITEM_KEY && - found_key->type != BTRFS_METADATA_ITEM_KEY) || - found_key->objectid > logical || + if (found_key->objectid > logical || found_key->objectid + size <= logical) { pr_debug("logical %llu is not within any extent\n", logical); return -ENOENT; @@ -1387,9 +1409,10 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, * returns <0 on error */ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - struct btrfs_extent_inline_ref **out_eiref, - int *out_type) + struct btrfs_key *key, + struct btrfs_extent_item *ei, u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) { unsigned long end; u64 flags; @@ -1399,19 +1422,26 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, /* first call */ flags = btrfs_extent_flags(eb, ei); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - info = (struct btrfs_tree_block_info *)(ei + 1); - *out_eiref = - (struct btrfs_extent_inline_ref *)(info + 1); + if (key->type == BTRFS_METADATA_ITEM_KEY) { + /* a skinny metadata extent */ + *out_eiref = + (struct btrfs_extent_inline_ref *)(ei + 1); + } else { + WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY); + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } } else { *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); } *ptr = (unsigned long)*out_eiref; - if ((void *)*ptr >= (void *)ei + item_size) + if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size) return -ENOENT; } end = (unsigned long)ei + item_size; - *out_eiref = (struct btrfs_extent_inline_ref *)*ptr; + *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); *ptr += btrfs_extent_inline_ref_size(*out_type); @@ -1430,8 +1460,8 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, * <0 on error. */ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - u64 *out_root, u8 *out_level) + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level) { int ret; int type; @@ -1442,8 +1472,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 1; while (1) { - ret = __get_extent_inline_ref(ptr, eb, ei, item_size, - &eiref, &type); + ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size, + &eiref, &type); if (ret < 0) return ret; @@ -1516,6 +1546,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + } else { + down_read(&fs_info->commit_root_sem); } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, @@ -1526,8 +1558,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { - ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots); + ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val, + tree_mod_seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -1549,6 +1581,8 @@ out: if (!search_commit_root) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_end_transaction(trans, fs_info->extent_root); + } else { + up_read(&fs_info->commit_root_sem); } return ret; @@ -1599,7 +1633,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, struct btrfs_key found_key; while (!ret) { - path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, &found_key); if (ret < 0) @@ -1612,14 +1645,17 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, parent = found_key.offset; slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - atomic_inc(&eb->refs); + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); - item = btrfs_item_nr(eb, slot); + item = btrfs_item_nr(slot); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { @@ -1672,17 +1708,20 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, ++found; slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - atomic_inc(&eb->refs); + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); cur_offset = 0; while (cur_offset < item_size) { diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index a910b27a8ad..86fc20fec28 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -40,8 +40,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, u64 *flags); int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - u64 *out_root, u8 *out_level); + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level); int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, @@ -55,8 +55,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots); + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 71f074e1870..4794923c410 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -19,6 +19,7 @@ #ifndef __BTRFS_I__ #define __BTRFS_I__ +#include <linux/hash.h> #include "extent_map.h" #include "extent_io.h" #include "ordered-data.h" @@ -42,6 +43,7 @@ #define BTRFS_INODE_COPY_EVERYTHING 8 #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 +#define BTRFS_INODE_HAS_PROPS 11 /* in memory btrfs inode */ struct btrfs_inode { @@ -107,14 +109,17 @@ struct btrfs_inode { u64 last_trans; /* - * log transid when this inode was last modified + * transid that last logged this inode */ - u64 last_sub_trans; + u64 logged_trans; /* - * transid that last logged this inode + * log transid when this inode was last modified */ - u64 logged_trans; + int last_sub_trans; + + /* a local copy of root's last_log_commit */ + int last_log_commit; /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file @@ -134,6 +139,9 @@ struct btrfs_inode { */ u64 index_cnt; + /* Cache the directory index number to speed the dir/file remove */ + u64 dir_index; + /* the fsync log has some corner cases that mean we have to check * directories to see if any unlinks have been done before * the directory was logged. See tree-log.c for all the @@ -150,9 +158,6 @@ struct btrfs_inode { /* flags field from the on disk inode */ u32 flags; - /* a local copy of root's last_log_commit */ - unsigned long last_log_commit; - /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent @@ -179,6 +184,25 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode) return container_of(inode, struct btrfs_inode, vfs_inode); } +static inline unsigned long btrfs_inode_hash(u64 objectid, + const struct btrfs_root *root) +{ + u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME); + +#if BITS_PER_LONG == 32 + h = (h >> 32) ^ (h & 0xffffffff); +#endif + + return (unsigned long)h; +} + +static inline void btrfs_insert_inode_hash(struct inode *inode) +{ + unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root); + + __insert_inode_hash(inode, h); +} + static inline u64 btrfs_ino(struct inode *inode) { u64 ino = BTRFS_I(inode)->location.objectid; @@ -255,9 +279,11 @@ static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); } +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end); + #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 1c47be18724..ce92ae30250 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -77,17 +77,26 @@ * the integrity of (super)-block write requests, do not * enable the config option BTRFS_FS_CHECK_INTEGRITY to * include and compile the integrity check tool. + * + * Expect millions of lines of information in the kernel log with an + * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the + * kernel config to at least 26 (which is 64MB). Usually the value is + * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be + * changed like this before LOG_BUF_SHIFT can be set to a high value: + * config LOG_BUF_SHIFT + * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" + * range 12 30 */ #include <linux/sched.h> #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/mutex.h> -#include <linux/crc32c.h> #include <linux/genhd.h> #include <linux/blkdev.h> #include "ctree.h" #include "disk-io.h" +#include "hash.h" #include "transaction.h" #include "extent_io.h" #include "volumes.h" @@ -124,6 +133,7 @@ #define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 #define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 #define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000 struct btrfsic_dev_state; struct btrfsic_state; @@ -323,7 +333,6 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); static void btrfsic_dump_database(struct btrfsic_state *state); -static void btrfsic_complete_bio_end_io(struct bio *bio, int err); static int btrfsic_test_for_metadata(struct btrfsic_state *state, char **datav, unsigned int num_pages); static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, @@ -1038,7 +1047,7 @@ leaf_item_out_of_bounce_error: disk_item_offset, sizeof(struct btrfs_item)); item_offset = btrfs_stack_item_offset(&disk_item); - item_size = btrfs_stack_item_offset(&disk_item); + item_size = btrfs_stack_item_size(&disk_item); disk_key = &disk_item.key; type = btrfs_disk_key_type(disk_key); @@ -1084,6 +1093,7 @@ leaf_item_out_of_bounce_error: next_stack = btrfsic_stack_frame_alloc(); if (NULL == next_stack) { + sf->error = -1; btrfsic_release_block_ctx( &sf-> next_block_ctx); @@ -1181,8 +1191,10 @@ continue_with_current_node_stack_frame: sf->next_block_ctx.datav[0]; next_stack = btrfsic_stack_frame_alloc(); - if (NULL == next_stack) + if (NULL == next_stack) { + sf->error = -1; goto one_stack_frame_backwards; + } next_stack->i = -1; next_stack->block = sf->next_block; @@ -1447,10 +1459,14 @@ static int btrfsic_handle_extent_data( btrfsic_read_from_block_data(block_ctx, &file_extent_item, file_extent_item_offset, sizeof(struct btrfs_file_extent_item)); - next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item) + - btrfs_stack_file_extent_offset(&file_extent_item); - generation = btrfs_stack_file_extent_generation(&file_extent_item); - num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); + if (btrfs_stack_file_extent_compression(&file_extent_item) == + BTRFS_COMPRESS_NONE) { + next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); + num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + } else { + num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); + } generation = btrfs_stack_file_extent_generation(&file_extent_item); if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) @@ -1677,7 +1693,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, for (i = 0; i < num_pages;) { struct bio *bio; unsigned int j; - DECLARE_COMPLETION_ONSTACK(complete); bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); if (!bio) { @@ -1687,9 +1702,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -1; } bio->bi_bdev = block_ctx->dev->bdev; - bio->bi_sector = dev_bytenr >> 9; - bio->bi_end_io = btrfsic_complete_bio_end_io; - bio->bi_private = &complete; + bio->bi_iter.bi_sector = dev_bytenr >> 9; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -1702,12 +1715,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, "btrfsic: error, failed to add a single page!\n"); return -1; } - submit_bio(READ, bio); - - /* this will also unplug the queue */ - wait_for_completion(&complete); - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (submit_bio_wait(READ, bio)) { printk(KERN_INFO "btrfsic: read error at logical %llu dev %s!\n", block_ctx->start, block_ctx->dev->name); @@ -1730,11 +1738,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, return block_ctx->len; } -static void btrfsic_complete_bio_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} - static void btrfsic_dump_database(struct btrfsic_state *state) { struct list_head *elem_all; @@ -1823,7 +1826,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, size_t sublen = i ? PAGE_CACHE_SIZE : (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); - crc = crc32c(crc, data, sublen); + crc = btrfs_crc32c(crc, data, sublen); } btrfs_csum_final(crc, csum); if (memcmp(csum, h->csum, state->csum_size)) @@ -1900,7 +1903,9 @@ again: dev_state, dev_bytenr); } - if (block->logical_bytenr != bytenr) { + if (block->logical_bytenr != bytenr && + !(!block->is_metadata && + block->logical_bytenr == 0)) printk(KERN_INFO "Written block @%llu (%s/%llu/%d)" " found in hash table, %c," @@ -1910,15 +1915,14 @@ again: block->mirror_num, btrfsic_get_block_type(state, block), block->logical_bytenr); - block->logical_bytenr = bytenr; - } else if (state->print_mask & - BTRFSIC_PRINT_MASK_VERBOSE) + else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block @%llu (%s/%llu/%d)" " found in hash table, %c.\n", bytenr, dev_state->name, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); + block->logical_bytenr = bytenr; } else { if (num_pages * PAGE_CACHE_SIZE < state->datablock_size) { @@ -2463,10 +2467,8 @@ static int btrfsic_process_written_superblock( } } - if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) { - WARN_ON(1); + if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0))) btrfsic_dump_tree(state); - } return 0; } @@ -2906,7 +2908,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, btrfsic_release_block_ctx(&block_ctx); } - if (!match) { + if (WARN_ON(!match)) { printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," " buffer->log_bytenr=%llu, submit_bio(bdev=%s," " phys_bytenr=%llu)!\n", @@ -2923,7 +2925,6 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, bytenr, block_ctx.dev->name, block_ctx.dev_bytenr, mirror_num); } - WARN_ON(1); } } @@ -3000,14 +3001,12 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) return submit_bh(rw, bh); } -void btrfsic_submit_bio(int rw, struct bio *bio) +static void __btrfsic_submit_bio(int rw, struct bio *bio) { struct btrfsic_dev_state *dev_state; - if (!btrfsic_is_initialized) { - submit_bio(rw, bio); + if (!btrfsic_is_initialized) return; - } mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before @@ -3017,10 +3016,11 @@ void btrfsic_submit_bio(int rw, struct bio *bio) (rw & WRITE) && NULL != bio->bi_io_vec) { unsigned int i; u64 dev_bytenr; + u64 cur_bytenr; int bio_is_patched; char **mapped_datav; - dev_bytenr = 512 * bio->bi_sector; + dev_bytenr = 512 * bio->bi_iter.bi_sector; bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) @@ -3028,13 +3028,14 @@ void btrfsic_submit_bio(int rw, struct bio *bio) "submit_bio(rw=0x%x, bi_vcnt=%u," " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", rw, bio->bi_vcnt, - (unsigned long long)bio->bi_sector, dev_bytenr, - bio->bi_bdev); + (unsigned long long)bio->bi_iter.bi_sector, + dev_bytenr, bio->bi_bdev); mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, GFP_NOFS); if (!mapped_datav) goto leave; + cur_bytenr = dev_bytenr; for (i = 0; i < bio->bi_vcnt; i++) { BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); @@ -3046,16 +3047,13 @@ void btrfsic_submit_bio(int rw, struct bio *bio) kfree(mapped_datav); goto leave; } - if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE) == - (dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) printk(KERN_INFO - "#%u: page=%p, len=%u, offset=%u\n", - i, bio->bi_io_vec[i].bv_page, - bio->bi_io_vec[i].bv_len, + "#%u: bytenr=%llu, len=%u, offset=%u\n", + i, cur_bytenr, bio->bi_io_vec[i].bv_len, bio->bi_io_vec[i].bv_offset); + cur_bytenr += bio->bi_io_vec[i].bv_len; } btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, bio->bi_vcnt, @@ -3099,10 +3097,20 @@ void btrfsic_submit_bio(int rw, struct bio *bio) } leave: mutex_unlock(&btrfsic_mutex); +} +void btrfsic_submit_bio(int rw, struct bio *bio) +{ + __btrfsic_submit_bio(rw, bio); submit_bio(rw, bio); } +int btrfsic_submit_bio_wait(int rw, struct bio *bio) +{ + __btrfsic_submit_bio(rw, bio); + return submit_bio_wait(rw, bio); +} + int btrfsic_mount(struct btrfs_root *root, struct btrfs_fs_devices *fs_devices, int including_extent_data, u32 print_mask) diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 8b59175cc50..13b8566c97a 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -22,9 +22,11 @@ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY int btrfsic_submit_bh(int rw, struct buffer_head *bh); void btrfsic_submit_bio(int rw, struct bio *bio); +int btrfsic_submit_bio_wait(int rw, struct bio *bio); #else #define btrfsic_submit_bh submit_bh #define btrfsic_submit_bio submit_bio +#define btrfsic_submit_bio_wait submit_bio_wait #endif int btrfsic_mount(struct btrfs_root *root, diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h deleted file mode 100644 index 7c4503ef6ef..00000000000 --- a/fs/btrfs/compat.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _COMPAT_H_ -#define _COMPAT_H_ - -#define btrfs_drop_nlink(inode) drop_nlink(inode) -#define btrfs_inc_nlink(inode) inc_nlink(inode) - -#endif /* _COMPAT_H_ */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 6aad98cb343..1daea0b4718 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -32,7 +32,6 @@ #include <linux/writeback.h> #include <linux/bit_spinlock.h> #include <linux/slab.h> -#include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -129,11 +128,10 @@ static int check_compressed_csum(struct inode *inode, kunmap_atomic(kaddr); if (csum != *cb_sum) { - printk(KERN_INFO "btrfs csum failed ino %llu " - "extent %llu csum %u " - "wanted %u mirror %d\n", - btrfs_ino(inode), disk_start, csum, *cb_sum, - cb->mirror_num); + btrfs_info(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu extent %llu csum %u wanted %u mirror %d", + btrfs_ino(inode), disk_start, csum, *cb_sum, + cb->mirror_num); ret = -EIO; goto fail; } @@ -173,7 +171,8 @@ static void end_compressed_bio_read(struct bio *bio, int err) goto out; inode = cb->inode; - ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + ret = check_compressed_csum(inode, cb, + (u64)bio->bi_iter.bi_sector << 9); if (ret) goto csum_failed; @@ -202,18 +201,16 @@ csum_failed: if (cb->errors) { bio_io_error(cb->orig_bio); } else { - int bio_index = 0; - struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + int i; + struct bio_vec *bvec; /* * we have verified the checksum already, set page * checked so the end_io handlers know about it */ - while (bio_index < cb->orig_bio->bi_vcnt) { + bio_for_each_segment_all(bvec, cb->orig_bio, i) SetPageChecked(bvec->bv_page); - bvec++; - bio_index++; - } + bio_endio(cb->orig_bio, 0); } @@ -360,7 +357,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - if(!bio) { + if (!bio) { kfree(cb); return -ENOMEM; } @@ -373,7 +370,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; - if (bio->bi_size) + if (bio->bi_iter.bi_size) ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, PAGE_CACHE_SIZE, bio, 0); @@ -413,7 +410,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); } if (bytes_left < PAGE_CACHE_SIZE) { - printk("bytes left %lu compress len %lu nr %lu\n", + btrfs_info(BTRFS_I(inode)->root->fs_info, + "bytes left %lu compress len %lu nr %lu", bytes_left, cb->compressed_len, cb->nr_pages); } bytes_left -= PAGE_CACHE_SIZE; @@ -474,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, pg_index); rcu_read_unlock(); - if (page) { + if (page && !radix_tree_exceptional_entry(page)) { misses++; if (misses > 4) break; @@ -507,7 +505,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (!em || last_offset < em->start || (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || - (em->block_start >> 9) != cb->orig_bio->bi_sector) { + (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, last_offset, end); unlock_page(page); @@ -553,7 +551,7 @@ next: * in it. We don't actually do IO on those pages but allocate new ones * to hold the compressed pages on disk. * - * bio->bi_sector points to the compressed extent on disk + * bio->bi_iter.bi_sector points to the compressed extent on disk * bio->bi_io_vec points to all of the inode pages * bio->bi_vcnt is a count of pages * @@ -574,7 +572,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct page *page; struct block_device *bdev; struct bio *comp_bio; - u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; u64 em_len; u64 em_start; struct extent_map *em; @@ -660,7 +658,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->mapping = inode->i_mapping; page->index = em_start >> PAGE_CACHE_SHIFT; - if (comp_bio->bi_size) + if (comp_bio->bi_iter.bi_size) ret = tree->ops->merge_bio_hook(READ, page, 0, PAGE_CACHE_SIZE, comp_bio, 0); @@ -688,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio, sums); BUG_ON(ret); /* -ENOMEM */ } - sums += (comp_bio->bi_size + root->sectorsize - 1) / - root->sectorsize; + sums += (comp_bio->bi_iter.bi_size + + root->sectorsize - 1) / root->sectorsize; ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); @@ -823,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace) spin_lock(workspace_lock); if (*num_workspace < num_online_cpus()) { - list_add_tail(workspace, idle_workspace); + list_add(workspace, idle_workspace); (*num_workspace)++; spin_unlock(workspace_lock); goto wake; @@ -889,7 +887,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -1; + return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, len, pages, @@ -925,7 +923,7 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -ENOMEM; + return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, disk_start, @@ -947,7 +945,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -ENOMEM; + return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, dest_page, start_byte, @@ -1012,6 +1010,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, bytes = min(bytes, working_bytes); kaddr = kmap_atomic(page_out); memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + if (*pg_index == (vcnt - 1) && *pg_offset == 0) + memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); kunmap_atomic(kaddr); flush_dcache_page(page_out); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 61b5bcd57b7..aeab453b8e2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -39,9 +39,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); struct btrfs_path *btrfs_alloc_path(void) { @@ -225,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) static void add_root_to_dirty_list(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); - if (root->track_dirty && list_empty(&root->dirty_list)) { + if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && + list_empty(&root->dirty_list)) { list_add(&root->dirty_list, &root->fs_info->dirty_cowonly_roots); } @@ -247,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int level; struct btrfs_disk_key disk_key; - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); if (level == 0) @@ -274,7 +275,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); WARN_ON(btrfs_header_generation(buf) > trans->transid); @@ -355,44 +356,14 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) } /* - * Increment the upper half of tree_mod_seq, set lower half zero. - * - * Must be called with fs_info->tree_mod_seq_lock held. + * Pull a new tree mod seq number for our operation. */ -static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info) -{ - u64 seq = atomic64_read(&fs_info->tree_mod_seq); - seq &= 0xffffffff00000000ull; - seq += 1ull << 32; - atomic64_set(&fs_info->tree_mod_seq, seq); - return seq; -} - -/* - * Increment the lower half of tree_mod_seq. - * - * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers - * are generated should not technically require a spin lock here. (Rationale: - * incrementing the minor while incrementing the major seq number is between its - * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it - * just returns a unique sequence number as usual.) We have decided to leave - * that requirement in here and rethink it once we notice it really imposes a - * problem on some workload. - */ -static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info) +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic64_inc_return(&fs_info->tree_mod_seq); } /* - * return the last minor in the previous major tree_mod_seq number - */ -u64 btrfs_tree_mod_seq_prev(u64 seq) -{ - return (seq & 0xffffffff00000000ull) - 1ull; -} - -/* * This adds a new blocker to the tree mod log's blocker list if the @elem * passed does not already have a sequence number set. So when a caller expects * to record tree modifications, it should ensure to set elem->seq to zero @@ -403,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq) u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { - u64 seq; - tree_mod_log_write_lock(fs_info); spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { - elem->seq = btrfs_inc_tree_mod_seq_major(fs_info); + elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - seq = btrfs_inc_tree_mod_seq_minor(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); tree_mod_log_write_unlock(fs_info); - return seq; + return elem->seq; } void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, @@ -475,6 +443,8 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * the index is the shifted logical of the *new* root node for root replace * operations, or the shifted logical of the affected block for all other * operations. + * + * Note: must be called with write lock (tree_mod_log_write_lock). */ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -483,27 +453,10 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node **new; struct rb_node *parent = NULL; struct tree_mod_elem *cur; - int ret = 0; BUG_ON(!tm); - tree_mod_log_write_lock(fs_info); - if (list_empty(&fs_info->tree_mod_seq_list)) { - tree_mod_log_write_unlock(fs_info); - /* - * Ok we no longer care about logging modifications, free up tm - * and return 0. Any callers shouldn't be using tm after - * calling tree_mod_log_insert, but if they do we can just - * change this to return a special error code to let the callers - * do their own thing. - */ - kfree(tm); - return 0; - } - - spin_lock(&fs_info->tree_mod_seq_lock); - tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); - spin_unlock(&fs_info->tree_mod_seq_lock); + tm->seq = btrfs_inc_tree_mod_seq(fs_info); tm_root = &fs_info->tree_mod_log; new = &tm_root->rb_node; @@ -518,18 +471,13 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) new = &((*new)->rb_left); else if (cur->seq > tm->seq) new = &((*new)->rb_right); - else { - ret = -EEXIST; - kfree(tm); - goto out; - } + else + return -EEXIST; } rb_link_node(&tm->node, parent, new); rb_insert_color(&tm->node, tm_root); -out: - tree_mod_log_write_unlock(fs_info); - return ret; + return 0; } /* @@ -545,19 +493,38 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, return 1; if (eb && btrfs_header_level(eb) == 0) return 1; + + tree_mod_log_write_lock(fs_info); + if (list_empty(&(fs_info)->tree_mod_seq_list)) { + tree_mod_log_write_unlock(fs_info); + return 1; + } + return 0; } -static inline int -__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ +static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 0; + if (eb && btrfs_header_level(eb) == 0) + return 0; + + return 1; +} + +static struct tree_mod_elem * +alloc_tree_mod_elem(struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { struct tree_mod_elem *tm; tm = kzalloc(sizeof(*tm), flags); if (!tm) - return -ENOMEM; + return NULL; tm->index = eb->start >> PAGE_CACHE_SHIFT; if (op != MOD_LOG_KEY_ADD) { @@ -567,8 +534,9 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, tm->op = op; tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); - return __tree_mod_log_insert(fs_info, tm); + return tm; } static noinline int @@ -576,10 +544,27 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int slot, enum mod_log_op op, gfp_t flags) { - if (tree_mod_dont_log(fs_info, eb)) + struct tree_mod_elem *tm; + int ret; + + if (!tree_mod_need_log(fs_info, eb)) return 0; - return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); + tm = alloc_tree_mod_elem(eb, slot, op, flags); + if (!tm) + return -ENOMEM; + + if (tree_mod_dont_log(fs_info, eb)) { + kfree(tm); + return 0; + } + + ret = __tree_mod_log_insert(fs_info, tm); + tree_mod_log_write_unlock(fs_info); + if (ret) + kfree(tm); + + return ret; } static noinline int @@ -587,53 +572,95 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items, gfp_t flags) { - struct tree_mod_elem *tm; - int ret; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, eb)) + if (!tree_mod_need_log(fs_info, eb)) return 0; + tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags); + if (!tm_list) + return -ENOMEM; + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = MOD_LOG_MOVE_KEYS; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, eb)) + goto free_tms; + locked = 1; + /* * When we override something during the move, we log these removals. * This can only happen when we move towards the beginning of the * buffer, i.e. dst_slot < src_slot. */ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) + goto free_tms; } - tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return -ENOMEM; + ret = __tree_mod_log_insert(fs_info, tm); + if (ret) + goto free_tms; + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); - tm->index = eb->start >> PAGE_CACHE_SHIFT; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = MOD_LOG_MOVE_KEYS; + return 0; +free_tms: + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + kfree(tm); - return __tree_mod_log_insert(fs_info, tm); + return ret; } -static inline void -__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static inline int +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) { - int i; - u32 nritems; + int i, j; int ret; - if (btrfs_header_level(eb) == 0) - return; - - nritems = btrfs_header_nritems(eb); for (i = nritems - 1; i >= 0; i--) { - ret = __tree_mod_log_insert_key(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } } + + return 0; } static noinline int @@ -642,17 +669,38 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *new_root, gfp_t flags, int log_removal) { - struct tree_mod_elem *tm; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; - if (tree_mod_dont_log(fs_info, NULL)) + if (!tree_mod_need_log(fs_info, NULL)) return 0; - if (log_removal) - __tree_mod_log_free_eb(fs_info, old_root); + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + flags); + if (!tm_list) { + ret = -ENOMEM; + goto free_tms; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + } tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return -ENOMEM; + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } tm->index = new_root->start >> PAGE_CACHE_SHIFT; tm->old_root.logical = old_root->start; @@ -660,7 +708,30 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->generation = btrfs_header_generation(old_root); tm->op = MOD_LOG_ROOT_REPLACE; - return __tree_mod_log_insert(fs_info, tm); + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + + if (tm_list) + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = __tree_mod_log_insert(fs_info, tm); + + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + + return ret; } static struct tree_mod_elem * @@ -729,31 +800,75 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) return __tree_mod_log_search(fs_info, start, min_seq, 0); } -static noinline void +static noinline int tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) { - int ret; + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add, **tm_list_rem; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, NULL)) - return; + if (!tree_mod_need_log(fs_info, NULL)) + return 0; if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) - return; + return 0; + tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { - ret = __tree_mod_log_insert_key(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE, GFP_NOFS); - BUG_ON(ret < 0); - ret = __tree_mod_log_insert_key(fs_info, dst, - i + dst_offset, - MOD_LOG_KEY_ADD, - GFP_NOFS); - BUG_ON(ret < 0); + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + MOD_LOG_KEY_REMOVE, GFP_NOFS); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto free_tms; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + MOD_LOG_KEY_ADD, GFP_NOFS); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + locked = 1; + + for (i = 0; i < nr_items; i++) { + ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); + if (ret) + goto free_tms; + ret = __tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; + } + + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return ret; } static inline void @@ -772,18 +887,58 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, { int ret; - ret = __tree_mod_log_insert_key(fs_info, eb, slot, + ret = tree_mod_log_insert_key(fs_info, eb, slot, MOD_LOG_KEY_REPLACE, atomic ? GFP_ATOMIC : GFP_NOFS); BUG_ON(ret < 0); } -static noinline void +static noinline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (btrfs_header_level(eb) == 0) + return 0; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + if (tree_mod_dont_log(fs_info, eb)) - return; - __tree_mod_log_free_eb(fs_info, eb); + goto free_tms; + + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + + return ret; } static noinline void @@ -809,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, * snapshot and the block was not allocated by tree relocation, * we know the block is not shared. */ - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && buf != root->node && buf != root->commit_root && (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) || btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) return 1; #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) return 1; #endif @@ -958,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(buf); - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); @@ -996,7 +1152,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); @@ -1005,7 +1161,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) return ret; @@ -1041,8 +1197,13 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - if (last_ref) - tree_mod_log_free_eb(root->fs_info, buf); + if (last_ref) { + ret = tree_mod_log_free_eb(root->fs_info, buf); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1285,11 +1446,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); blocksize = btrfs_level_size(root, old_root->level); old = read_tree_block(root, logical, blocksize, 0); - if (!old || !extent_buffer_uptodate(old)) { + if (WARN_ON(!old || !extent_buffer_uptodate(old))) { free_extent_buffer(old); - pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", - logical); - WARN_ON(1); + btrfs_warn(root->fs_info, + "failed to read tree block %llu from get_old_root", logical); } else { eb = btrfs_clone_extent_buffer(old); free_extent_buffer(old); @@ -1346,6 +1506,10 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif /* ensure we can see the force_cow */ smp_rmb(); @@ -1364,7 +1528,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && - !root->force_cow) + !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) return 0; return 1; } @@ -2463,6 +2627,49 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key, return 0; } +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, + u64 iobjectid, u64 ioff, u8 key_type, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_path *path; + + key.type = key_type; + key.objectid = iobjectid; + key.offset = ioff; + + if (found_path == NULL) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } else + path = found_path; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if ((ret < 0) || (found_key == NULL)) { + if (path != found_path) + btrfs_free_path(path); + return ret; + } + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || + found_key->objectid != key.objectid) + return 1; + + return 0; +} + /* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf @@ -2496,6 +2703,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); + BUG_ON(!cow && ins_len); if (ins_len < 0) { lowest_unlock = 2; @@ -2533,9 +2741,13 @@ again: * the commit roots are read only * so we always do read locks */ + if (p->need_commit_sem) + down_read(&root->fs_info->commit_root_sem); b = root->commit_root; extent_buffer_get(b); level = btrfs_header_level(b); + if (p->need_commit_sem) + up_read(&root->fs_info->commit_root_sem); if (!p->skip_locking) btrfs_tree_read_lock(b); } else { @@ -2604,8 +2816,6 @@ again: } } cow_done: - BUG_ON(!cow && ins_len); - p->nodes[level] = b; btrfs_clear_path_blocking(p, NULL, 0); @@ -2615,13 +2825,19 @@ cow_done: * It is safe to drop the lock on our parent before we * go through the expensive btree search on b. * - * If cow is true, then we might be changing slot zero, - * which may require changing the parent. So, we can't - * drop the lock until after we know which slot we're - * operating on. + * If we're inserting or deleting (ins_len != 0), then we might + * be changing slot zero, which may require changing the parent. + * So, we can't drop the lock until after we know which slot + * we're operating on. */ - if (!cow) - btrfs_unlock_up_safe(p, level + 1); + if (!ins_len && !p->keep_locks) { + int u = level + 1; + + if (u < BTRFS_MAX_LEVEL && p->locks[u]) { + btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); + p->locks[u] = 0; + } + } ret = key_search(b, key, level, &prev_cmp, &slot); @@ -2649,7 +2865,7 @@ cow_done: * which means we must have a write lock * on the parent */ - if (slot == 0 && cow && + if (slot == 0 && ins_len && write_lock_level < level + 1) { write_lock_level = level + 1; btrfs_release_path(p); @@ -2758,7 +2974,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, int level; int lowest_unlock = 1; u8 lowest_level = 0; - int prev_cmp; + int prev_cmp = -1; lowest_level = p->lowest_level; WARN_ON(p->nodes[0] != NULL); @@ -2769,7 +2985,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, } again: - prev_cmp = -1; b = get_old_root(root, time_seq); level = btrfs_header_level(b); p->locks[level] = BTRFS_READ_LOCK; @@ -2787,6 +3002,11 @@ again: */ btrfs_unlock_up_safe(p, level + 1); + /* + * Since we can unwind eb's we want to do a real search every + * time. + */ + prev_cmp = -1; ret = key_search(b, key, level, &prev_cmp, &slot); if (level != 0) { @@ -2898,7 +3118,9 @@ again: if (ret < 0) return ret; if (!ret) { - p->slots[0] = btrfs_header_nritems(leaf) - 1; + leaf = p->nodes[0]; + if (p->slots[0] == btrfs_header_nritems(leaf)) + p->slots[0]--; return 0; } if (!return_any) @@ -3019,8 +3241,12 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); - tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3090,8 +3316,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, (dst_nritems) * sizeof(struct btrfs_key_ptr)); - tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3148,7 +3378,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); - write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c), + write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(c, root->fs_info->chunk_tree_uuid, @@ -3287,12 +3517,17 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); write_extent_buffer(split, root->fs_info->fsid, - btrfs_header_fsid(split), BTRFS_FSID_SIZE); + btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(split, root->fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, + mid, c_nritems - mid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), @@ -3337,8 +3572,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) if (!nr) return 0; btrfs_init_map_token(&token); - start_item = btrfs_item_nr(l, start); - end_item = btrfs_item_nr(l, end); + start_item = btrfs_item_nr(start); + end_item = btrfs_item_nr(end); data_len = btrfs_token_item_offset(l, start_item, &token) + btrfs_token_item_size(l, start_item, &token); data_len = data_len - btrfs_token_item_offset(l, end_item, &token); @@ -3359,8 +3594,8 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, int ret; ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); if (ret < 0) { - printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " - "used %d nritems %d\n", + btrfs_crit(root->fs_info, + "leaf free space ret %d, leaf data size %lu, used %d nritems %d", ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), leaf_space_used(leaf, 0, nritems), nritems); } @@ -3406,7 +3641,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { - item = btrfs_item_nr(left, i); + item = btrfs_item_nr(i); if (!empty && push_items > 0) { if (path->slots[0] > i) @@ -3470,7 +3705,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); + item = btrfs_item_nr(i); push_space -= btrfs_token_item_size(right, item, &token); btrfs_set_token_item_offset(right, item, push_space, &token); } @@ -3568,6 +3803,19 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; + if (path->slots[0] == left_nritems && !empty) { + /* Key greater than all keys in the leaf, right neighbor has + * enough room for it and we're not emptying our leaf to delete + * it, therefore use right neighbor to insert the new item and + * no need to touch/dirty our left leaft. */ + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1]++; + return 0; + } + return __push_leaf_right(trans, root, path, min_data_size, empty, right, free_space, left_nritems, min_slot); out_unlock: @@ -3612,7 +3860,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { - item = btrfs_item_nr(right, i); + item = btrfs_item_nr(i); if (!empty && push_items > 0) { if (path->slots[0] < i) @@ -3639,8 +3887,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, ret = 1; goto out; } - if (!empty && push_items == btrfs_header_nritems(right)) - WARN_ON(1); + WARN_ON(!empty && push_items == btrfs_header_nritems(right)); /* push data from right to left */ copy_extent_buffer(left, right, @@ -3663,7 +3910,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; - item = btrfs_item_nr(left, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(left, item, &token); btrfs_set_token_item_offset(left, item, @@ -3694,7 +3941,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); + item = btrfs_item_nr(i); push_space = push_space - btrfs_token_item_size(right, item, &token); @@ -3835,7 +4082,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_item_end_nr(l, mid); for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(right, i); + struct btrfs_item *item = btrfs_item_nr(i); u32 ioff; ioff = btrfs_token_item_offset(right, item, &token); @@ -3885,14 +4132,17 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, int progress = 0; int slot; u32 nritems; + int space_needed = data_size; slot = path->slots[0]; + if (slot < btrfs_header_nritems(path->nodes[0])) + space_needed -= btrfs_leaf_free_space(root, path->nodes[0]); /* * try to push all the items after our slot into the * right leaf */ - ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -3912,7 +4162,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, /* try to push all the items before our slot into the next leaf */ slot = path->slots[0]; - ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -3956,13 +4206,18 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, /* first try to make some room by pushing left and right */ if (data_size && path->nodes[1]) { - wret = push_leaf_right(trans, root, path, data_size, - data_size, 0, 0); + int space_needed = data_size; + + if (slot < btrfs_header_nritems(l)) + space_needed -= btrfs_leaf_free_space(root, l); + + wret = push_leaf_right(trans, root, path, space_needed, + space_needed, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, - data_size, 0, (u32)-1); + wret = push_leaf_left(trans, root, path, space_needed, + space_needed, 0, (u32)-1); if (wret < 0) return wret; } @@ -4016,7 +4271,7 @@ again: data_size > BTRFS_LEAF_DATA_SIZE(root)) { if (data_size && !tried_avoid_double) goto push_for_double; - split = 2 ; + split = 2; } } } @@ -4042,7 +4297,7 @@ again: btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); write_extent_buffer(right, root->fs_info->fsid, - btrfs_header_fsid(right), BTRFS_FSID_SIZE); + btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(right, root->fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(right), @@ -4177,7 +4432,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(path); - item = btrfs_item_nr(leaf, path->slots[0]); + item = btrfs_item_nr(path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); @@ -4200,7 +4455,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(leaf, &disk_key, slot); - new_item = btrfs_item_nr(leaf, slot); + new_item = btrfs_item_nr(slot); btrfs_set_item_offset(leaf, new_item, orig_offset); btrfs_set_item_size(leaf, new_item, item_size - split_offset); @@ -4339,7 +4594,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, /* first correct the data pointers */ for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, @@ -4387,7 +4642,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, fixup_low_keys(root, path, &disk_key, 1); } - item = btrfs_item_nr(leaf, slot); + item = btrfs_item_nr(slot); btrfs_set_item_size(leaf, item, new_size); btrfs_mark_buffer_dirty(leaf); @@ -4430,7 +4685,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, BUG_ON(slot < 0); if (slot >= nritems) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d too large, nritems %d\n", + btrfs_crit(root->fs_info, "slot %d too large, nritems %d", slot, nritems); BUG_ON(1); } @@ -4441,7 +4696,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, /* first correct the data pointers */ for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, @@ -4455,7 +4710,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, data_end = old_data; old_size = btrfs_item_size_nr(leaf, slot); - item = btrfs_item_nr(leaf, slot); + item = btrfs_item_nr(slot); btrfs_set_item_size(leaf, item, old_size + data_size); btrfs_mark_buffer_dirty(leaf); @@ -4493,7 +4748,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (btrfs_leaf_free_space(root, leaf) < total_size) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "not enough freespace need %u have %d\n", + btrfs_crit(root->fs_info, "not enough freespace need %u have %d", total_size, btrfs_leaf_free_space(root, leaf)); BUG(); } @@ -4503,7 +4758,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (old_data < data_end) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d", slot, old_data, data_end); BUG_ON(1); } @@ -4514,7 +4769,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr( i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, ioff - total_data, &token); @@ -4535,7 +4790,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, for (i = 0; i < nr; i++) { btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(leaf, slot + i); + item = btrfs_item_nr(slot + i); btrfs_set_token_item_offset(leaf, item, data_end - data_size[i], &token); data_end -= data_size[i]; @@ -4730,7 +4985,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, for (i = slot + nr; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, ioff + dsize, &token); @@ -4815,7 +5070,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * This may release the path, and so you may lose any locks held at the * time you call it. */ -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { struct btrfs_key key; struct btrfs_disk_key found_key; @@ -4823,14 +5078,18 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) btrfs_item_key_to_cpu(path->nodes[0], &key, 0); - if (key.offset > 0) + if (key.offset > 0) { key.offset--; - else if (key.type > 0) + } else if (key.type > 0) { key.type--; - else if (key.objectid > 0) + key.offset = (u64)-1; + } else if (key.objectid > 0) { key.objectid--; - else + key.type = (u8)-1; + key.offset = (u64)-1; + } else { return 1; + } btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -4838,7 +5097,17 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) return ret; btrfs_item_key(path->nodes[0], &found_key, 0); ret = comp_keys(&found_key, &key); - if (ret < 0) + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) return 0; return 1; } @@ -4866,7 +5135,6 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) * was nothing in the tree that matched the search criteria. */ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, - struct btrfs_key *max_key, struct btrfs_path *path, u64 min_trans) { @@ -4911,10 +5179,8 @@ again: * If it is too old, old, skip to the next one. */ while (slot < nritems) { - u64 blockptr; u64 gen; - blockptr = btrfs_node_blockptr(cur, slot); gen = btrfs_node_ptr_generation(cur, slot); if (gen < min_trans) { slot++; @@ -5080,7 +5346,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, { int ret; int cmp; - struct btrfs_trans_handle *trans = NULL; struct btrfs_path *left_path = NULL; struct btrfs_path *right_path = NULL; struct btrfs_key left_key; @@ -5096,9 +5361,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root, int advance_right; u64 left_blockptr; u64 right_blockptr; - u64 left_start_ctransid; - u64 right_start_ctransid; - u64 ctransid; + u64 left_gen; + u64 right_gen; left_path = btrfs_alloc_path(); if (!left_path) { @@ -5122,21 +5386,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_path->search_commit_root = 1; right_path->skip_locking = 1; - spin_lock(&left_root->root_item_lock); - left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - - spin_lock(&right_root->root_item_lock); - right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - /* * Strategy: Go to the first items of both trees. Then do * @@ -5173,6 +5422,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, * the right if possible or go up and right. */ + down_read(&left_root->fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; left_path->nodes[left_level] = left_root->commit_root; @@ -5182,6 +5432,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_root_level = right_level; right_path->nodes[right_level] = right_root->commit_root; extent_buffer_get(right_path->nodes[right_level]); + up_read(&left_root->fs_info->commit_root_sem); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -5200,67 +5451,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, advance_left = advance_right = 0; while (1) { - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. This means, we need to - * join and leave transactions for every item that we process. - */ - if (trans && btrfs_should_end_transaction(trans, left_root)) { - btrfs_release_path(left_path); - btrfs_release_path(right_path); - - ret = btrfs_end_transaction(trans, left_root); - trans = NULL; - if (ret < 0) - goto out; - } - /* now rejoin the transaction */ - if (!trans) { - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - - spin_lock(&left_root->root_item_lock); - ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - if (ctransid != left_start_ctransid) - left_start_ctransid = 0; - - spin_lock(&right_root->root_item_lock); - ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - if (ctransid != right_start_ctransid) - right_start_ctransid = 0; - - if (!left_start_ctransid || !right_start_ctransid) { - WARN(1, KERN_WARNING - "btrfs: btrfs_compare_tree detected " - "a change in one of the trees while " - "iterating. This is probably a " - "bug.\n"); - ret = -EIO; - goto out; - } - - /* - * the commit root may have changed, so start again - * where we stopped - */ - left_path->lowest_level = left_level; - right_path->lowest_level = right_level; - ret = btrfs_search_slot(NULL, left_root, - &left_key, left_path, 0, 0); - if (ret < 0) - goto out; - ret = btrfs_search_slot(NULL, right_root, - &right_key, right_path, 0, 0); - if (ret < 0) - goto out; - } - if (advance_left && !left_end_reached) { ret = tree_advance(left_root, left_path, &left_level, left_root_level, @@ -5360,7 +5550,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_blockptr = btrfs_node_blockptr( right_path->nodes[right_level], right_path->slots[right_level]); - if (left_blockptr == right_blockptr) { + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { /* * As we're on a shared block, don't * allow to go deeper. @@ -5383,14 +5580,6 @@ out: btrfs_free_path(left_path); btrfs_free_path(right_path); kfree(tmp_buf); - - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, left_root); - else - btrfs_end_transaction(trans, left_root); - } - return ret; } @@ -5529,6 +5718,24 @@ again: ret = 0; goto done; } + /* + * So the above check misses one case: + * - after releasing the path above, someone has removed the item that + * used to be at the very end of the block, and balance between leafs + * gets another one with bigger key.offset to replace it. + * + * This one should be returned as well, or we can get leaf corruption + * later(esp. in __btrfs_drop_extents()). + * + * And a bit more explanation about this check, + * with ret > 0, the key isn't found, the path points to the slot + * where it should be inserted, so the path->slots[0] item must be the + * bigger one. + */ + if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { + ret = 0; + goto done; + } while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) { @@ -5677,3 +5884,46 @@ int btrfs_previous_item(struct btrfs_root *root, } return 1; } + +/* + * search in extent tree to find a previous Metadata/Data extent item with + * min objecitd. + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY || + found_key.type == BTRFS_METADATA_ITEM_KEY) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < BTRFS_EXTENT_ITEM_KEY) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0506f40ede8..be91397f4e9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include <asm/kmap_types.h> #include <linux/pagemap.h> #include <linux/btrfs.h> +#include <linux/workqueue.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -47,6 +48,12 @@ extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +#define STATIC noinline +#else +#define STATIC static noinline +#endif + #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ #define BTRFS_MAX_MIRRORS 3 @@ -345,6 +352,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_ERROR 0 #define BTRFS_FS_STATE_REMOUNTING 1 #define BTRFS_FS_STATE_TRANS_ABORTED 2 +#define BTRFS_FS_STATE_DEV_REPLACING 3 /* Super block flags */ /* Errors detected */ @@ -515,9 +523,15 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) #define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) +#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ @@ -526,7 +540,12 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES) + +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL /* * A leaf is full of items. offset and size tell us where to find @@ -591,6 +610,7 @@ struct btrfs_path { unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; + unsigned int need_commit_sem:1; }; /* @@ -737,6 +757,12 @@ struct btrfs_dir_item { #define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) +/* + * Internal in-memory flag that a subvolume has been marked for deletion but + * still visible as a directory + */ +#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) + struct btrfs_root_item { struct btrfs_inode_item inode; __le64 generation; @@ -821,7 +847,10 @@ struct btrfs_disk_balance_args { /* BTRFS_BALANCE_ARGS_* */ __le64 flags; - __le64 unused[8]; + /* BTRFS_BALANCE_ARGS_LIMIT value */ + __le64 limit; + + __le64 unused[7]; } __attribute__ ((__packed__)); /* @@ -968,7 +997,8 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) #define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) -#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE +#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ + BTRFS_SPACE_INFO_GLOBAL_RSV) enum btrfs_raid_types { BTRFS_RAID_RAID10, @@ -1000,6 +1030,12 @@ enum btrfs_raid_types { */ #define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) +/* + * A fake block group type that is used to communicate global block reserve + * size to userspace via the SPACE_INFO ioctl. + */ +#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) + #define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ BTRFS_AVAIL_ALLOC_BIT_SINGLE) @@ -1087,8 +1123,14 @@ struct btrfs_qgroup_limit_item { __le64 rsv_excl; } __attribute__ ((__packed__)); +/* For raid type sysfs entries */ +struct raid_kobject { + int raid_type; + struct kobject kobj; +}; + struct btrfs_space_info { - u64 flags; + spinlock_t lock; u64 total_bytes; /* total bytes in the space, this doesn't take mirrors into account */ @@ -1098,14 +1140,25 @@ struct btrfs_space_info { transaction finishes */ u64 bytes_reserved; /* total bytes the allocator has reserved for current allocations */ - u64 bytes_readonly; /* total bytes that are read only */ - u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int flush:1; /* set if we are trying to make space */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + u64 disk_used; /* total bytes used on disk */ u64 disk_total; /* total bytes on disk, takes mirrors into account */ + u64 flags; + /* * bytes_pinned is kept in line with what is actually pinned, as in * we've called update_block_group and dropped the bytes_used counter @@ -1118,22 +1171,15 @@ struct btrfs_space_info { */ struct percpu_counter total_bytes_pinned; - unsigned int full:1; /* indicates that we cannot allocate any more - chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ - - unsigned int flush:1; /* set if we are trying to make space */ - - unsigned int force_alloc; /* set if we need to force a chunk - alloc for this space */ - struct list_head list; + struct rw_semaphore groups_sem; /* for block groups in our same type */ struct list_head block_groups[BTRFS_NR_RAID_TYPES]; - spinlock_t lock; - struct rw_semaphore groups_sem; wait_queue_head_t wait; + + struct kobject kobj; + struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; #define BTRFS_BLOCK_RSV_GLOBAL 1 @@ -1213,11 +1259,19 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; + u64 delalloc_bytes; u64 bytes_super; u64 flags; u64 sectorsize; u64 cache_generation; + /* + * It is just used for the delayed data space allocation because + * only the data space allocation and the relative metadata update + * can be done cross the transaction. + */ + struct rw_semaphore data_rwsem; + /* for raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; @@ -1283,6 +1337,8 @@ struct btrfs_stripe_hash_table { #define BTRFS_STRIPE_HASH_TABLE_BITS 11 +void btrfs_init_async_reclaim_work(struct work_struct *work); + /* fs_info */ struct reloc_control; struct btrfs_device; @@ -1340,6 +1396,7 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; + u64 avg_delayed_ref_runtime; /* * this is updated to the current trans every time a full commit @@ -1417,7 +1474,7 @@ struct btrfs_fs_info { */ struct mutex ordered_extent_flush_mutex; - struct rw_semaphore extent_commit_sem; + struct rw_semaphore commit_root_sem; struct rw_semaphore cleanup_work_sem; @@ -1442,7 +1499,6 @@ struct btrfs_fs_info { spinlock_t tree_mod_seq_lock; atomic64_t tree_mod_seq; struct list_head tree_mod_seq_list; - struct seq_list tree_mod_seq_elem; /* this protects tree_mod_log */ rwlock_t tree_mod_log_lock; @@ -1468,6 +1524,7 @@ struct btrfs_fs_info { */ struct list_head ordered_roots; + struct mutex delalloc_root_mutex; spinlock_t delalloc_root_lock; /* all fs/file tree roots that have delalloc inodes. */ struct list_head delalloc_roots; @@ -1482,33 +1539,37 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ - struct btrfs_workers generic_worker; - struct btrfs_workers workers; - struct btrfs_workers delalloc_workers; - struct btrfs_workers flush_workers; - struct btrfs_workers endio_workers; - struct btrfs_workers endio_meta_workers; - struct btrfs_workers endio_raid56_workers; - struct btrfs_workers rmw_workers; - struct btrfs_workers endio_meta_write_workers; - struct btrfs_workers endio_write_workers; - struct btrfs_workers endio_freespace_worker; - struct btrfs_workers submit_workers; - struct btrfs_workers caching_workers; - struct btrfs_workers readahead_workers; + struct btrfs_workqueue *workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct btrfs_workqueue *endio_workers; + struct btrfs_workqueue *endio_meta_workers; + struct btrfs_workqueue *endio_raid56_workers; + struct btrfs_workqueue *rmw_workers; + struct btrfs_workqueue *endio_meta_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *submit_workers; + struct btrfs_workqueue *caching_workers; + struct btrfs_workqueue *readahead_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens * for the sys_munmap function call path */ - struct btrfs_workers fixup_workers; - struct btrfs_workers delayed_workers; + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; + + /* the extent workers do delayed refs on the extent allocation tree */ + struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; int thread_pool_size; struct kobject super_kobj; + struct kobject *space_info_kobj; + struct kobject *device_dir_kobj; struct completion kobj_unregister; int do_barriers; int closing; @@ -1580,11 +1641,10 @@ struct btrfs_fs_info { atomic_t scrubs_paused; atomic_t scrub_cancel_req; wait_queue_head_t scrub_pause_wait; - struct rw_semaphore scrub_super_lock; int scrub_workers_refcnt; - struct btrfs_workers scrub_workers; - struct btrfs_workers scrub_wr_completion_workers; - struct btrfs_workers scrub_nocow_workers; + struct btrfs_workqueue *scrub_workers; + struct btrfs_workqueue *scrub_wr_completion_workers; + struct btrfs_workqueue *scrub_nocow_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; @@ -1605,7 +1665,10 @@ struct btrfs_fs_info { /* holds configuration and tracking. Protected by qgroup_lock */ struct rb_root qgroup_tree; + struct rb_root qgroup_op_tree; spinlock_t qgroup_lock; + spinlock_t qgroup_op_lock; + atomic_t qgroup_op_seq; /* * used to avoid frequently calling ulist_alloc()/ulist_free() @@ -1625,7 +1688,7 @@ struct btrfs_fs_info { /* qgroup rescan items */ struct mutex qgroup_rescan_lock; /* protects the progress item */ struct btrfs_key qgroup_rescan_progress; - struct btrfs_workers qgroup_rescan_workers; + struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; @@ -1638,6 +1701,10 @@ struct btrfs_fs_info { spinlock_t reada_lock; struct radix_tree_root reada_tree; + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + struct radix_tree_root buffer_radix; + /* next backup root to be overwritten */ int backup_root_index; @@ -1648,11 +1715,42 @@ struct btrfs_fs_info { atomic_t mutually_exclusive_operation_running; + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; + struct semaphore uuid_tree_rescan_sem; unsigned int update_uuid_tree_gen:1; + + /* Used to reclaim the metadata space in the background. */ + struct work_struct async_reclaim_work; +}; + +struct btrfs_subvolume_writers { + struct percpu_counter counter; + wait_queue_head_t wait; }; /* + * The state of btrfs root + */ +/* + * btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ +#define BTRFS_ROOT_IN_TRANS_SETUP 0 +#define BTRFS_ROOT_REF_COWS 1 +#define BTRFS_ROOT_TRACK_DIRTY 2 +#define BTRFS_ROOT_IN_RADIX 3 +#define BTRFS_ROOT_DUMMY_ROOT 4 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5 +#define BTRFS_ROOT_DEFRAG_RUNNING 6 +#define BTRFS_ROOT_FORCE_COW 7 +#define BTRFS_ROOT_MULTI_LOG_TASKS 8 + +/* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ @@ -1663,6 +1761,7 @@ struct btrfs_root { struct btrfs_root *log_root; struct btrfs_root *reloc_root; + unsigned long state; struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; @@ -1676,7 +1775,6 @@ struct btrfs_root { struct btrfs_block_rsv *block_rsv; /* free ino cache stuff */ - struct mutex fs_commit_mutex; struct btrfs_free_space_ctl *free_ino_ctl; enum btrfs_caching_type cached; spinlock_t cache_lock; @@ -1688,13 +1786,16 @@ struct btrfs_root { struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; + struct list_head log_ctxs[2]; atomic_t log_writers; atomic_t log_commit[2]; atomic_t log_batch; - unsigned long log_transid; - unsigned long last_log_commit; + int log_transid; + /* No matter the commit succeeds or not*/ + int log_transid_committed; + /* Just be updated when the commit succeeds. */ + int last_log_commit; pid_t log_start_pid; - bool log_multiple_pids; u64 objectid; u64 last_trans; @@ -1714,21 +1815,13 @@ struct btrfs_root { u64 highest_objectid; - /* btrfs_record_root_in_trans is a multi-step process, - * and it can race with the balancing code. But the - * race is very small, and only the first time the root - * is added to each transaction. So in_trans_setup - * is used to tell us when more checks are required - */ - unsigned long in_trans_setup; - int ref_cows; - int track_dirty; - int in_radix; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + u64 alloc_bytenr; +#endif u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; - int defrag_running; char *name; /* the dirty list is only used by non-reference counted roots */ @@ -1742,7 +1835,6 @@ struct btrfs_root { spinlock_t orphan_lock; atomic_t orphan_inodes; struct btrfs_block_rsv *orphan_block_rsv; - int orphan_item_inserted; int orphan_cleanup_state; spinlock_t inode_lock; @@ -1760,11 +1852,10 @@ struct btrfs_root { */ dev_t anon_dev; - int force_cow; - spinlock_t root_item_lock; atomic_t refs; + struct mutex delalloc_mutex; spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. It is possible for @@ -1774,6 +1865,8 @@ struct btrfs_root { struct list_head delalloc_inodes; struct list_head delalloc_root; u64 nr_delalloc_inodes; + + struct mutex ordered_extent_mutex; /* * this is used by the balancing code to wait for all the pending * ordered extents @@ -1788,6 +1881,14 @@ struct btrfs_root { struct list_head ordered_extents; struct list_head ordered_root; u64 nr_ordered_extents; + + /* + * Number of currently running SEND ioctls to prevent + * manipulation with the read-only status via SUBVOL_SETFLAGS + */ + int send_in_progress; + struct btrfs_subvolume_writers *subv_writers; + atomic_t will_be_snapshoted; }; struct btrfs_ioctl_defrag_range_args { @@ -1990,6 +2091,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) +#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) @@ -1998,6 +2100,20 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) +#define btrfs_set_and_info(root, opt, fmt, args...) \ +{ \ + if (!btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_set_opt(root->fs_info->mount_opt, opt); \ +} + +#define btrfs_clear_and_info(root, opt, fmt, args...) \ +{ \ + if (btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_clear_opt(root->fs_info->mount_opt, opt); \ +} + /* * Inode flags */ @@ -2461,8 +2577,7 @@ static inline unsigned long btrfs_item_nr_offset(int nr) sizeof(struct btrfs_item) * nr; } -static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, - int nr) +static inline struct btrfs_item *btrfs_item_nr(int nr) { return (struct btrfs_item *)btrfs_item_nr_offset(nr); } @@ -2475,30 +2590,30 @@ static inline u32 btrfs_item_end(struct extent_buffer *eb, static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) { - return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); + return btrfs_item_end(eb, btrfs_item_nr(nr)); } static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) { - return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); + return btrfs_item_offset(eb, btrfs_item_nr(nr)); } static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) { - return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); + return btrfs_item_size(eb, btrfs_item_nr(nr)); } static inline void btrfs_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(eb, nr); + struct btrfs_item *item = btrfs_item_nr(nr); read_eb_member(eb, item, struct btrfs_item, key, disk_key); } static inline void btrfs_set_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(eb, nr); + struct btrfs_item *item = btrfs_item_nr(nr); write_eb_member(eb, item, struct btrfs_item, key, disk_key); } @@ -2666,7 +2781,7 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, btrfs_set_header_flags(eb, flags); } -static inline unsigned long btrfs_header_fsid(struct extent_buffer *eb) +static inline unsigned long btrfs_header_fsid(void) { return offsetof(struct btrfs_header, fsid); } @@ -2715,6 +2830,11 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root) return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } +static inline bool btrfs_root_dead(struct btrfs_root *root) +{ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); @@ -2824,6 +2944,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, cpu->vend = le64_to_cpu(disk->vend); cpu->target = le64_to_cpu(disk->target); cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); } static inline void @@ -2841,6 +2962,7 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, disk->vend = cpu_to_le64(cpu->vend); disk->target = cpu_to_le64(cpu->target); disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); } /* struct btrfs_super_block */ @@ -2919,6 +3041,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); static inline unsigned long btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) @@ -2952,15 +3078,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); -/* this returns the number of file bytes represented by the inline item. - * If an item is compressed, this is the uncompressed size - */ -static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, - struct btrfs_file_extent_item *e) -{ - return btrfs_file_extent_ram_bytes(eb, e); -} - /* * this returns the number of bytes used by the item on disk, minus the * size of any extent headers. If a file is compressed on disk, this is @@ -2974,6 +3091,32 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } +/* this returns the number of file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + int slot, + struct btrfs_file_extent_item *fi) +{ + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + /* + * return the space used on disk if this item isn't + * compressed or encoded + */ + if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 && + btrfs_token_file_extent_encryption(eb, fi, &token) == 0 && + btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) { + return btrfs_file_extent_inline_item_len(eb, + btrfs_item_nr(slot)); + } + + /* otherwise use the ram bytes field */ + return btrfs_token_file_extent_ram_bytes(eb, fi, &token); +} + + /* btrfs_dev_stats_item */ static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, struct btrfs_dev_stats_item *ptr, @@ -3105,11 +3248,6 @@ static inline u32 btrfs_level_size(struct btrfs_root *root, int level) ((unsigned long)(btrfs_leaf_data(leaf) + \ btrfs_item_offset_nr(leaf, slot))) -static inline struct dentry *fdentry(struct file *file) -{ - return file->f_path.dentry; -} - static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && @@ -3142,9 +3280,13 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, @@ -3162,6 +3304,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int get_block_group_index(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, @@ -3181,11 +3324,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data); + struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref, int no_quota); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref, int no_quota); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, @@ -3193,9 +3336,10 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow); + u64 owner, u64 offset, int no_quota); -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len, + int delalloc); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, u64 start, u64 len); void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, @@ -3205,7 +3349,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow); + u64 root_objectid, u64 owner, u64 offset, int no_quota); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3293,6 +3437,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); +int btrfs_start_nocow_write(struct btrfs_root *root); +void btrfs_end_nocow_write(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3300,6 +3446,8 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); @@ -3308,7 +3456,6 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, - struct btrfs_key *max_key, struct btrfs_path *path, u64 min_trans); enum btrfs_compare_tree_result { @@ -3350,6 +3497,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); @@ -3399,6 +3548,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); static inline int btrfs_next_old_item(struct btrfs_root *root, @@ -3462,7 +3612,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); -u64 btrfs_tree_mod_seq_prev(u64 seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ @@ -3563,12 +3712,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); -int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod, - u64 *ret_index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3613,11 +3756,14 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums); int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); -int btrfs_csum_truncate(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u64 isize); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em); + /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; @@ -3675,12 +3821,14 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, - int delay_iput); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, u64 new_dirid); + struct btrfs_root *new_root, + struct btrfs_root *parent_root, + u64 new_dirid); int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); @@ -3745,14 +3893,14 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); -int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace, - u64 start, u64 end, int skip_pinned, - int modified); extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache); + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted); int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); @@ -3771,6 +3919,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* sysfs.c */ int btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); +int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); /* xattr.c */ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); @@ -3803,14 +3953,20 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +#ifdef DEBUG #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#endif #ifdef CONFIG_BTRFS_ASSERT static inline void assfail(char *expr, char *file, int line) { - printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d", + pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", expr, file, line); BUG(); } @@ -3848,7 +4004,7 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, if (!(features & flag)) { features |= flag; btrfs_set_super_incompat_flags(disk_super, features); - printk(KERN_INFO "btrfs: setting %llu feature flag\n", + btrfs_info(fs_info, "setting %llu feature flag", flag); } spin_unlock(&fs_info->super_lock); @@ -3906,20 +4062,17 @@ do { \ /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); -int btrfs_acl_chmod(struct inode *inode); #else #define btrfs_get_acl NULL +#define btrfs_set_acl NULL static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { return 0; } -static inline int btrfs_acl_chmod(struct inode *inode) -{ - return 0; -} #endif /* relocation.c */ @@ -3944,15 +4097,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace); void btrfs_scrub_pause(struct btrfs_root *root); -void btrfs_scrub_pause_super(struct btrfs_root *root); void btrfs_scrub_continue(struct btrfs_root *root); -void btrfs_scrub_continue_super(struct btrfs_root *root); int btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); +/* dev-replace.c */ +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info); + /* reada.c */ struct reada_control { struct btrfs_root *root; /* tree to prefetch */ @@ -3969,52 +4125,6 @@ void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err); -/* qgroup.c */ -struct qgroup_update { - struct list_head list; - struct btrfs_delayed_ref_node *node; - struct btrfs_delayed_extent_op *extent_op; -}; - -int btrfs_quota_enable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_quota_disable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); -void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - char *name); -int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid); -int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - struct btrfs_qgroup_limit *limit); -int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); -void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); -struct btrfs_delayed_extent_op; -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_run_qgroups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, - struct btrfs_qgroup_inherit *inherit); -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); - -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || @@ -4028,5 +4138,11 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) return signal_pending(current); } +/* Sanity test specific functions */ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_destroy_inode(struct inode *inode); +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); +#endif #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index cbd9523ad09..da775bfdebc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -55,8 +55,7 @@ static inline void btrfs_init_delayed_node( delayed_node->inode_id = inode_id; atomic_set(&delayed_node->refs, 0); delayed_node->count = 0; - delayed_node->in_list = 0; - delayed_node->inode_dirty = 0; + delayed_node->flags = 0; delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); @@ -108,8 +107,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) return node; } btrfs_inode->delayed_node = node; - atomic_inc(&node->refs); /* can be accessed */ - atomic_inc(&node->refs); /* cached in the inode */ + /* can be accessed and cached in the inode */ + atomic_add(2, &node->refs); spin_unlock(&root->inode_lock); return node; } @@ -138,8 +137,8 @@ again: return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); - atomic_inc(&node->refs); /* cached in the btrfs inode */ - atomic_inc(&node->refs); /* can be accessed */ + /* cached in the btrfs inode and can be accessed */ + atomic_add(2, &node->refs); ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) { @@ -150,8 +149,8 @@ again: spin_lock(&root->inode_lock); ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); if (ret == -EEXIST) { - kmem_cache_free(delayed_node_cache, node); spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); radix_tree_preload_end(); goto again; } @@ -172,7 +171,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, int mod) { spin_lock(&root->lock); - if (node->in_list) { + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { if (!list_empty(&node->p_list)) list_move_tail(&node->p_list, &root->prepare_list); else if (mod) @@ -182,7 +181,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, list_add_tail(&node->p_list, &root->prepare_list); atomic_inc(&node->refs); /* inserted into list */ root->nodes++; - node->in_list = 1; + set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } spin_unlock(&root->lock); } @@ -192,13 +191,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, struct btrfs_delayed_node *node) { spin_lock(&root->lock); - if (node->in_list) { + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; atomic_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) list_del_init(&node->p_list); - node->in_list = 0; + clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } spin_unlock(&root->lock); } @@ -231,7 +230,8 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( delayed_root = node->root->fs_info->delayed_root; spin_lock(&delayed_root->lock); - if (!node->in_list) { /* not in the list */ + if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + /* not in the list */ if (list_empty(&delayed_root->node_list)) goto out; p = delayed_root->node_list.next; @@ -267,14 +267,17 @@ static void __btrfs_release_delayed_node( mutex_unlock(&delayed_node->mutex); if (atomic_dec_and_test(&delayed_node->refs)) { + bool free = false; struct btrfs_root *root = delayed_node->root; spin_lock(&root->inode_lock); if (atomic_read(&delayed_node->refs) == 0) { radix_tree_delete(&root->delayed_nodes_tree, delayed_node->inode_id); - kmem_cache_free(delayed_node_cache, delayed_node); + free = true; } spin_unlock(&root->inode_lock); + if (free) + kmem_cache_free(delayed_node_cache, delayed_node); } } @@ -649,14 +652,13 @@ static int btrfs_delayed_inode_reserve_metadata( goto out; ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) + if (!WARN_ON(ret)) goto out; /* * Ok this is a problem, let's just steal from the global rsv * since this really shouldn't happen that often. */ - WARN_ON(1); ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, dst_rsv, num_bytes); goto out; @@ -771,13 +773,13 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, */ btrfs_set_path_blocking(path); - keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS); + keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS); if (!keys) { ret = -ENOMEM; goto out; } - data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS); + data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS); if (!data_size) { ret = -ENOMEM; goto error; @@ -1005,9 +1007,10 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) { struct btrfs_delayed_root *delayed_root; - if (delayed_node && delayed_node->inode_dirty) { + if (delayed_node && + test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { BUG_ON(!delayed_node->root); - delayed_node->inode_dirty = 0; + clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--; delayed_root = delayed_node->root->fs_info->delayed_root; @@ -1015,6 +1018,18 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) } } +static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_delayed_root *delayed_root; + + ASSERT(delayed_node->root); + clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); +} + static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1023,13 +1038,19 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + int mod; int ret; key.objectid = node->inode_id; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - ret = btrfs_lookup_inode(trans, root, path, &key, 1); + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + mod = -1; + else + mod = 1; + + ret = btrfs_lookup_inode(trans, root, path, &key, mod); if (ret > 0) { btrfs_release_path(path); return -ENOENT; @@ -1037,19 +1058,58 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, return ret; } - btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); + if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + goto no_iref; + + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) + goto search; +again: + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != node->inode_id) + goto out; + + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + goto out; + + /* + * Delayed iref deletion is for the inode who has only one link, + * so there is only one iref. The case that several irefs are + * in the same item doesn't exist. + */ + btrfs_del_item(trans, root, path); +out: + btrfs_release_delayed_iref(node); +no_iref: + btrfs_release_path(path); +err_out: btrfs_delayed_inode_release_metadata(root, node); btrfs_release_delayed_inode(node); - return 0; + return ret; + +search: + btrfs_release_path(path); + + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = -1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto err_out; + ASSERT(ret); + + ret = 0; + leaf = path->nodes[0]; + path->slots[0]--; + goto again; } static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, @@ -1060,7 +1120,7 @@ static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, int ret; mutex_lock(&node->mutex); - if (!node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) { mutex_unlock(&node->mutex); return 0; } @@ -1174,8 +1234,10 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, mutex_unlock(&delayed_node->mutex); path = btrfs_alloc_path(); - if (!path) + if (!path) { + btrfs_release_delayed_node(delayed_node); return -ENOMEM; + } path->leave_spinning = 1; block_rsv = trans->block_rsv; @@ -1202,7 +1264,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode) return 0; mutex_lock(&delayed_node->mutex); - if (!delayed_node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); return 0; @@ -1226,7 +1288,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode) trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; mutex_lock(&delayed_node->mutex); - if (delayed_node->inode_dirty) + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) ret = __btrfs_update_delayed_inode(trans, delayed_node->root, path, delayed_node); else @@ -1299,36 +1361,9 @@ again: trans->block_rsv = &root->fs_info->delayed_block_rsv; __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - /* - * Maybe new delayed items have been inserted, so we need requeue - * the work. Besides that, we must dequeue the empty delayed nodes - * to avoid the race between delayed items balance and the worker. - * The race like this: - * Task1 Worker thread - * count == 0, needn't requeue - * also needn't insert the - * delayed node into prepare - * list again. - * add lots of delayed items - * queue the delayed node - * already in the list, - * and not in the prepare - * list, it means the delayed - * node is being dealt with - * by the worker. - * do delayed items balance - * the delayed node is being - * dealt with by the worker - * now, just wait. - * the worker goto idle. - * Task1 will sleep until the transaction is commited. - */ - mutex_lock(&delayed_node->mutex); - btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node); - mutex_unlock(&delayed_node->mutex); trans->block_rsv = block_rsv; - btrfs_end_transaction_dmeta(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty_nodelay(root); release_path: @@ -1360,11 +1395,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - async_work->work.func = btrfs_async_run_delayed_root; - async_work->work.flags = 0; + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, + NULL, NULL); async_work->nr = nr; - btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); + btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); return 0; } @@ -1375,52 +1410,41 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root) WARN_ON(btrfs_first_delayed_node(delayed_root)); } -static int refs_newer(struct btrfs_delayed_root *delayed_root, - int seq, int count) +static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) { int val = atomic_read(&delayed_root->items_seq); - if (val < seq || val >= seq + count) + if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) + return 1; + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return 1; + return 0; } void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; - int seq; delayed_root = btrfs_get_delayed_root(root); if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return; - seq = atomic_read(&delayed_root->items_seq); - if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { + int seq; int ret; - DEFINE_WAIT(__wait); + + seq = atomic_read(&delayed_root->items_seq); ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); if (ret) return; - while (1) { - prepare_to_wait(&delayed_root->wait, &__wait, - TASK_INTERRUPTIBLE); - - if (refs_newer(delayed_root, seq, - BTRFS_DELAYED_BATCH) || - atomic_read(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND) { - break; - } - if (!signal_pending(current)) - schedule(); - else - break; - } - finish_wait(&delayed_root->wait, &__wait); + wait_event_interruptible(delayed_root->wait, + could_end_wait(delayed_root, seq)); + return; } btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); @@ -1471,9 +1495,9 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(name: %.*s) " + btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) " "into the insertion tree of the delayed node" - "(root id: %llu, inode id: %llu, errno: %d)\n", + "(root id: %llu, inode id: %llu, errno: %d)", name_len, name, delayed_node->root->objectid, delayed_node->inode_id, ret); BUG(); @@ -1543,9 +1567,9 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&node->mutex); ret = __btrfs_add_delayed_deletion_item(node, item); if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(index: %llu) " + btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) " "into the deletion tree of the delayed node" - "(root id: %llu, inode id: %llu, errno: %d)\n", + "(root id: %llu, inode id: %llu, errno: %d)", index, node->root->objectid, node->inode_id, ret); BUG(); @@ -1758,7 +1782,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) return -ENOENT; mutex_lock(&delayed_node->mutex); - if (!delayed_node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); return -ENOENT; @@ -1809,7 +1833,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, return PTR_ERR(delayed_node); mutex_lock(&delayed_node->mutex); - if (delayed_node->inode_dirty) { + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { fill_stack_inode_item(trans, &delayed_node->inode_item, inode); goto release_node; } @@ -1820,7 +1844,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; fill_stack_inode_item(trans, &delayed_node->inode_item, inode); - delayed_node->inode_dirty = 1; + set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); release_node: @@ -1829,6 +1853,41 @@ release_node: return ret; } +int btrfs_delayed_delete_inode_ref(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + /* + * We don't reserve space for inode ref deletion is because: + * - We ONLY do async inode ref deletion for the inode who has only + * one link(i_nlink == 1), it means there is only one inode ref. + * And in most case, the inode ref and the inode item are in the + * same leaf, and we will deal with them at the same time. + * Since we are sure we will reserve the space for the inode item, + * it is unnecessary to reserve space for inode ref deletion. + * - If the inode ref and the inode item are not in the same leaf, + * We also needn't worry about enospc problem, because we reserve + * much more space for the inode update than it needs. + * - At the worst, we can steal some space from the global reservation. + * It is very rare. + */ + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + goto release_node; + + set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count++; + atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) { struct btrfs_root *root = delayed_node->root; @@ -1851,7 +1910,10 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_item(prev_item); } - if (delayed_node->inode_dirty) { + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + btrfs_release_delayed_iref(delayed_node); + + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { btrfs_delayed_inode_release_metadata(root, delayed_node); btrfs_release_delayed_inode(delayed_node); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index a4b38f934d1..f70119f2542 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -48,6 +48,10 @@ struct btrfs_delayed_root { wait_queue_head_t wait; }; +#define BTRFS_DELAYED_NODE_IN_LIST 0 +#define BTRFS_DELAYED_NODE_INODE_DIRTY 1 +#define BTRFS_DELAYED_NODE_DEL_IREF 2 + struct btrfs_delayed_node { u64 inode_id; u64 bytes_reserved; @@ -65,8 +69,7 @@ struct btrfs_delayed_node { struct btrfs_inode_item inode_item; atomic_t refs; u64 index_cnt; - bool in_list; - bool inode_dirty; + unsigned long flags; int count; }; @@ -125,6 +128,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_delayed_delete_inode_ref(struct inode *inode); /* Used for drop dead root */ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e4d467be2dd..6d16bea94e1 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, return -1; if (ref1->type > ref2->type) return 1; + if (ref1->no_quota > ref2->no_quota) + return 1; + if (ref1->no_quota < ref2->no_quota) + return -1; /* merging of sequenced refs is not allowed */ if (compare_seq) { if (ref1->seq < ref2->seq) @@ -161,56 +165,69 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, return NULL; } +/* insert a new ref to head ref rbtree */ +static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_head *entry; + struct btrfs_delayed_ref_head *ins; + u64 bytenr; + + ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); + bytenr = ins->node.bytenr; + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, + href_node); + + if (bytenr < entry->node.bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->node.bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + /* * find an head entry based on bytenr. This returns the delayed ref * head if it was able to find one, or NULL if nothing was in that spot. * If return_bigger is given, the next bigger entry is returned if no exact * match is found. */ -static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, - u64 bytenr, - struct btrfs_delayed_ref_node **last, - int return_bigger) +static struct btrfs_delayed_ref_head * +find_ref_head(struct rb_root *root, u64 bytenr, + int return_bigger) { struct rb_node *n; - struct btrfs_delayed_ref_node *entry; - int cmp = 0; + struct btrfs_delayed_ref_head *entry; -again: n = root->rb_node; entry = NULL; while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - WARN_ON(!entry->in_tree); - if (last) - *last = entry; - - if (bytenr < entry->bytenr) - cmp = -1; - else if (bytenr > entry->bytenr) - cmp = 1; - else if (!btrfs_delayed_ref_is_head(entry)) - cmp = 1; - else - cmp = 0; + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - if (cmp < 0) + if (bytenr < entry->node.bytenr) n = n->rb_left; - else if (cmp > 0) + else if (bytenr > entry->node.bytenr) n = n->rb_right; else return entry; } if (entry && return_bigger) { - if (cmp > 0) { - n = rb_next(&entry->rb_node); + if (bytenr > entry->node.bytenr) { + n = rb_next(&entry->href_node); if (!n) n = rb_first(root); - entry = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - bytenr = entry->bytenr; - return_bigger = 0; - goto again; + entry = rb_entry(n, struct btrfs_delayed_ref_head, + href_node); + return entry; } return entry; } @@ -243,33 +260,38 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { - rb_erase(&ref->rb_node, &delayed_refs->root); + if (btrfs_delayed_ref_is_head(ref)) { + head = btrfs_delayed_node_to_head(ref); + rb_erase(&head->href_node, &delayed_refs->href_root); + } else { + assert_spin_locked(&head->lock); + rb_erase(&ref->rb_node, &head->ref_root); + } ref->in_tree = 0; btrfs_put_delayed_ref(ref); - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); if (trans->delayed_ref_updates) trans->delayed_ref_updates--; } static int merge_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) { struct rb_node *node; - int merged = 0; int mod = 0; int done = 0; - node = rb_prev(&ref->rb_node); - while (node) { + node = rb_next(&ref->rb_node); + while (!done && node) { struct btrfs_delayed_ref_node *next; next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_prev(node); - if (next->bytenr != ref->bytenr) - break; + node = rb_next(node); if (seq && next->seq >= seq) break; if (comp_entry(ref, next, 0)) @@ -289,12 +311,11 @@ static int merge_ref(struct btrfs_trans_handle *trans, mod = -next->ref_mod; } - merged++; - drop_delayed_ref(trans, delayed_refs, next); + drop_delayed_ref(trans, delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, ref); - break; + drop_delayed_ref(trans, delayed_refs, head, ref); + done = 1; } else { /* * You can't have multiples of the same ref on a tree @@ -303,13 +324,8 @@ static int merge_ref(struct btrfs_trans_handle *trans, WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } - - if (done) - break; - node = rb_prev(&ref->rb_node); } - - return merged; + return done; } void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, @@ -320,6 +336,14 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct rb_node *node; u64 seq = 0; + assert_spin_locked(&head->lock); + /* + * We don't have too much refs to merge in the case of delayed data + * refs. + */ + if (head->is_data) + return; + spin_lock(&fs_info->tree_mod_seq_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { struct seq_list *elem; @@ -330,22 +354,19 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, } spin_unlock(&fs_info->tree_mod_seq_lock); - node = rb_prev(&head->node.rb_node); + node = rb_first(&head->ref_root); while (node) { struct btrfs_delayed_ref_node *ref; ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr != head->node.bytenr) - break; - /* We can't merge refs that are outside of our seq count */ if (seq && ref->seq >= seq) break; - if (merge_ref(trans, delayed_refs, ref, seq)) - node = rb_prev(&head->node.rb_node); + if (merge_ref(trans, delayed_refs, head, ref, seq)) + node = rb_first(&head->ref_root); else - node = rb_prev(node); + node = rb_next(&ref->rb_node); } } @@ -373,71 +394,52 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, return ret; } -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 start) +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans) { - int count = 0; struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *head; + u64 start; + bool loop = false; delayed_refs = &trans->transaction->delayed_refs; - if (start == 0) { - node = rb_first(&delayed_refs->root); - } else { - ref = NULL; - find_ref_head(&delayed_refs->root, start + 1, &ref, 1); - if (ref) { - node = &ref->rb_node; - } else - node = rb_first(&delayed_refs->root); - } + again: - while (node && count < 32) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - if (list_empty(&head->cluster)) { - list_add_tail(&head->cluster, cluster); - delayed_refs->run_delayed_start = - head->node.bytenr; - count++; - - WARN_ON(delayed_refs->num_heads_ready == 0); - delayed_refs->num_heads_ready--; - } else if (count) { - /* the goal of the clustering is to find extents - * that are likely to end up in the same extent - * leaf on disk. So, we don't want them spread - * all over the tree. Stop now if we've hit - * a head that was already in use - */ - break; - } - } - node = rb_next(node); - } - if (count) { - return 0; - } else if (start) { - /* - * we've gone to the end of the rbtree without finding any - * clusters. start from the beginning and try again - */ + start = delayed_refs->run_delayed_start; + head = find_ref_head(&delayed_refs->href_root, start, 1); + if (!head && !loop) { + delayed_refs->run_delayed_start = 0; start = 0; - node = rb_first(&delayed_refs->root); - goto again; + loop = true; + head = find_ref_head(&delayed_refs->href_root, start, 1); + if (!head) + return NULL; + } else if (!head && loop) { + return NULL; } - return 1; -} -void btrfs_release_ref_cluster(struct list_head *cluster) -{ - struct list_head *pos, *q; + while (head->processing) { + struct rb_node *node; - list_for_each_safe(pos, q, cluster) - list_del_init(pos); + node = rb_next(&head->href_node); + if (!node) { + if (loop) + return NULL; + delayed_refs->run_delayed_start = 0; + start = 0; + loop = true; + goto again; + } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + } + + head->processing = 1; + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + delayed_refs->run_delayed_start = head->node.bytenr + + head->node.num_bytes; + return head; } /* @@ -451,6 +453,7 @@ void btrfs_release_ref_cluster(struct list_head *cluster) static noinline void update_existing_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *existing, struct btrfs_delayed_ref_node *update) { @@ -463,7 +466,7 @@ update_existing_ref(struct btrfs_trans_handle *trans, */ existing->ref_mod--; if (existing->ref_mod == 0) - drop_delayed_ref(trans, delayed_refs, existing); + drop_delayed_ref(trans, delayed_refs, head, existing); else WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || existing->type == BTRFS_SHARED_BLOCK_REF_KEY); @@ -496,6 +499,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, ref = btrfs_delayed_node_to_head(update); BUG_ON(existing_ref->is_data != ref->is_data); + spin_lock(&existing_ref->lock); if (ref->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref @@ -533,9 +537,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, } } /* - * update the reference mod on the head to reflect this new operation + * update the reference mod on the head to reflect this new operation, + * only need the lock for this case cause we could be processing it + * currently, for refs we just added we know we're a-ok. */ existing->ref_mod += update->ref_mod; + spin_unlock(&existing_ref->lock); } /* @@ -543,13 +550,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, - int action, int is_data) +static noinline struct btrfs_delayed_ref_head * +add_delayed_ref_head(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, int action, int is_data) { - struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; @@ -596,38 +603,43 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; + head_ref->ref_root = RB_ROOT; + head_ref->processing = 0; - INIT_LIST_HEAD(&head_ref->cluster); + spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); trace_add_delayed_ref_head(ref, head_ref, action); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - + existing = htree_insert(&delayed_refs->href_root, + &head_ref->href_node); if (existing) { - update_existing_head_ref(existing, ref); + update_existing_head_ref(&existing->node, ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + head_ref = existing; } else { delayed_refs->num_heads++; delayed_refs->num_heads_ready++; - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + return head_ref; } /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, - int for_cow) +static noinline void +add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, int level, + int action, int no_quota) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; @@ -637,6 +649,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ @@ -647,9 +661,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - - if (need_ref_seq(for_cow, ref_root)) - seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -663,30 +675,33 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(ref, full_ref, action); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); } else { - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + spin_unlock(&head_ref->lock); } /* * helper to insert a delayed data ref into the rbtree. */ -static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, u64 owner, u64 offset, - int action, int for_cow) +static noinline void +add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, u64 owner, + u64 offset, int action, int no_quota) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; @@ -698,6 +713,9 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); + /* first set the basic ref node struct up */ atomic_set(&ref->refs, 1); ref->bytenr = bytenr; @@ -706,9 +724,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - - if (need_ref_seq(for_cow, ref_root)) - seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -724,19 +740,21 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(ref, full_ref, action); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } else { - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + spin_unlock(&head_ref->lock); } /* @@ -749,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow) + int no_quota) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; + BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) @@ -775,15 +796,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 0); + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 0); - add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, - for_cow); + no_quota); spin_unlock(&delayed_refs->lock); - if (need_ref_seq(for_cow, ref_root)) - btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } @@ -797,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow) + int no_quota) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; + BUG_ON(extent_op && !extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) @@ -823,15 +845,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 1); + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 1); - add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, - action, for_cow); + action, no_quota); spin_unlock(&delayed_refs->lock); - if (need_ref_seq(for_cow, ref_root)) - btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } @@ -869,14 +889,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) { - struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); - if (ref) - return btrfs_delayed_node_to_head(ref); - return NULL; + return find_ref_head(&delayed_refs->href_root, bytenr, 0); } void btrfs_delayed_ref_exit(void) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 70b962cc177..a764e2340d4 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; + unsigned int no_quota:1; /* is this node still in the rbtree? */ unsigned int is_head:1; unsigned int in_tree:1; @@ -81,7 +82,10 @@ struct btrfs_delayed_ref_head { */ struct mutex mutex; - struct list_head cluster; + spinlock_t lock; + struct rb_root ref_root; + + struct rb_node href_node; struct btrfs_delayed_extent_op *extent_op; /* @@ -98,6 +102,7 @@ struct btrfs_delayed_ref_head { */ unsigned int must_insert_reserved:1; unsigned int is_data:1; + unsigned int processing:1; }; struct btrfs_delayed_tree_ref { @@ -116,7 +121,8 @@ struct btrfs_delayed_data_ref { }; struct btrfs_delayed_ref_root { - struct rb_root root; + /* head ref rbtree */ + struct rb_root href_root; /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -124,7 +130,7 @@ struct btrfs_delayed_ref_root { /* how many delayed ref updates we've queued, used by the * throttling code */ - unsigned long num_entries; + atomic_t num_entries; /* total number of head nodes in tree */ unsigned long num_heads; @@ -133,15 +139,6 @@ struct btrfs_delayed_ref_root { unsigned long num_heads_ready; /* - * bumped when someone is making progress on the delayed - * refs, so that other procs know they are just adding to - * contention intead of helping - */ - atomic_t procs_running_refs; - atomic_t ref_seq; - wait_queue_head_t wait; - - /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need * to be run right away @@ -200,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow); + int no_quota); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow); + int no_quota); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, @@ -226,34 +223,15 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) mutex_unlock(&head->mutex); } -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 search_start); -void btrfs_release_ref_cluster(struct list_head *cluster); + +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq); /* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ - if (for_cow) - return 0; - - if (rootid == BTRFS_FS_TREE_OBJECTID) - return 1; - - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) - return 1; - - return 0; -} - -/* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 9efb94e9585..eea26e1b2fd 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -26,7 +26,6 @@ #include <linux/kthread.h> #include <linux/math64.h> #include <asm/div64.h> -#include "compat.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -37,8 +36,8 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "sysfs.h" -static u64 btrfs_get_seconds_since_1970(void); static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); static void btrfs_dev_replace_update_device_in_mapping_tree( @@ -104,7 +103,8 @@ no_valid_dev_replace_entry_found: ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); if (item_size != sizeof(struct btrfs_dev_replace_item)) { - pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); + btrfs_warn(fs_info, + "dev_replace entry found has unexpected size, ignore entry"); goto no_valid_dev_replace_entry_found; } @@ -147,13 +147,19 @@ no_valid_dev_replace_entry_found: if (!dev_replace->srcdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; - pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", - src_devid); + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + src_devid); } if (!dev_replace->tgtdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; - pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?", BTRFS_DEV_REPLACE_DEVID); } if (dev_replace->tgtdev) { @@ -212,7 +218,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, } ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - pr_warn("btrfs: error %d while searching for dev_replace item!\n", + btrfs_warn(fs_info, "error %d while searching for dev_replace item!", ret); goto out; } @@ -232,7 +238,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - pr_warn("btrfs: delete too small dev_replace item failed %d!\n", + btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", ret); goto out; } @@ -245,7 +251,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - pr_warn("btrfs: insert dev_replace item failed %d!\n", + btrfs_warn(fs_info, "insert dev_replace item failed %d!", ret); goto out; } @@ -296,13 +302,6 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } -static u64 btrfs_get_seconds_since_1970(void) -{ - struct timespec t = CURRENT_TIME_SEC; - - return t.tv_sec; -} - int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_ioctl_dev_replace_args *args) { @@ -314,8 +313,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *src_device = NULL; if (btrfs_fs_incompat(fs_info, RAID56)) { - pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n"); - return -EINVAL; + btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); + return -EOPNOTSUPP; } switch (args->start.cont_reading_from_srcdev_mode) { @@ -334,7 +333,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, &tgt_device); if (ret) { - pr_err("btrfs: target device %s is invalid!\n", + btrfs_err(fs_info, "target device %s is invalid!", args->start.tgtdev_name); mutex_unlock(&fs_info->volume_mutex); return -EINVAL; @@ -350,7 +349,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, } if (tgt_device->total_bytes < src_device->total_bytes) { - pr_err("btrfs: target device is smaller than source device!\n"); + btrfs_err(fs_info, "target device is smaller than source device!"); ret = -EINVAL; goto leave_no_lock; } @@ -375,7 +374,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->tgtdev = tgt_device; printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s) started\n", + "BTRFS: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -390,7 +389,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, * go to the tgtdev as well (refer to btrfs_map_block()). */ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; - dev_replace->time_started = btrfs_get_seconds_since_1970(); + dev_replace->time_started = get_seconds(); dev_replace->cursor_left = 0; dev_replace->committed_cursor_left = 0; dev_replace->cursor_left_last_write_of_item = 0; @@ -400,7 +399,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); - btrfs_wait_all_ordered_extents(root->fs_info); + btrfs_wait_ordered_roots(root->fs_info, -1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -433,6 +432,35 @@ leave_no_lock: return ret; } +/* + * blocked until all flighting bios are finished. + */ +static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) +{ + s64 writers; + DEFINE_WAIT(wait); + + set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + do { + prepare_to_wait(&fs_info->replace_wait, &wait, + TASK_UNINTERRUPTIBLE); + writers = percpu_counter_sum(&fs_info->bio_counter); + if (writers) + schedule(); + finish_wait(&fs_info->replace_wait, &wait); + } while (writers); +} + +/* + * we have removed target device, it is safe to allow new bios request. + */ +static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -460,22 +488,16 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device = dev_replace->srcdev; btrfs_dev_replace_unlock(dev_replace); - /* replace old device with new one in mapping tree */ - if (!scrub_ret) - btrfs_dev_replace_update_device_in_mapping_tree(fs_info, - src_device, - tgt_device); - /* * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0); + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_all_ordered_extents(root->fs_info); + btrfs_wait_ordered_roots(root->fs_info, -1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -486,6 +508,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, WARN_ON(ret); /* keep away write_all_supers() during the finishing procedure */ + mutex_lock(&root->fs_info->chunk_mutex); mutex_lock(&root->fs_info->fs_devices->device_list_mutex); btrfs_dev_replace_lock(dev_replace); dev_replace->replace_state = @@ -493,18 +516,24 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - if (scrub_ret) { + /* replace old device with new one in mapping tree */ + if (!scrub_ret) { + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + } else { printk_in_rcu(KERN_ERR - "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); btrfs_dev_replace_unlock(dev_replace); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -513,7 +542,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", + "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -534,8 +563,16 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + + btrfs_rm_dev_replace_blocked(fs_info); + btrfs_rm_dev_replace_srcdev(fs_info, src_device); + btrfs_rm_dev_replace_unblocked(fs_info); + /* * this is again a consistent state where no dev_replace procedure * is running, the target device is part of the filesystem, the @@ -545,6 +582,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, */ btrfs_dev_replace_unlock(dev_replace); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -650,6 +688,9 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) u64 result; int ret; + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); btrfs_dev_replace_lock(dev_replace); switch (dev_replace->replace_state) { @@ -668,7 +709,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) break; } dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; - dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; btrfs_dev_replace_unlock(dev_replace); btrfs_scrub_cancel(fs_info); @@ -703,9 +744,9 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; - dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - pr_info("btrfs: suspending dev_replace for unmount\n"); + btrfs_info(fs_info, "suspending dev_replace for unmount"); break; } @@ -734,8 +775,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) break; } if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { - pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" - "btrfs: you may cancel the operation after 'mount -o degraded'\n"); + btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); + btrfs_info(fs_info, + "you may cancel the operation after 'mount -o degraded'"); btrfs_dev_replace_unlock(dev_replace); return 0; } @@ -761,14 +803,14 @@ static int btrfs_dev_replace_kthread(void *data) kfree(status_args); do_div(progress, 10); printk_in_rcu(KERN_INFO - "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", - dev_replace->srcdev->missing ? "<missing disk>" : - rcu_str_deref(dev_replace->srcdev->name), - dev_replace->srcdev->devid, - dev_replace->tgtdev ? - rcu_str_deref(dev_replace->tgtdev->name) : - "<missing target disk>", - (unsigned int)progress); + "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + dev_replace->srcdev->missing ? "<missing disk>" : + rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? + rcu_str_deref(dev_replace->tgtdev->name) : + "<missing target disk>", + (unsigned int)progress); } btrfs_dev_replace_continue_on_mount(fs_info); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); @@ -860,3 +902,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) mutex_unlock(&dev_replace->lock_management_lock); } } + +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) +{ + percpu_counter_inc(&fs_info->bio_counter); +} + +void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + percpu_counter_dec(&fs_info->bio_counter); + + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) +{ + DEFINE_WAIT(wait); +again: + percpu_counter_inc(&fs_info->bio_counter); + if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) { + btrfs_bio_counter_dec(fs_info); + wait_event(fs_info->replace_wait, + !test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state)); + goto again; + } + +} diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 79e594e341c..a0691df5dce 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -58,7 +58,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); + item = btrfs_item_nr(path->slots[0]); ptr = btrfs_item_ptr(leaf, path->slots[0], char); BUG_ON(data_size > btrfs_item_size(leaf, item)); ptr += btrfs_item_size(leaf, item) - data_size; @@ -261,7 +261,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, * see if there is room in the item to insert this * name */ - data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); + data_size = sizeof(*di) + name_len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size_nr(leaf, slot) + @@ -459,7 +459,7 @@ int verify_dir_item(struct btrfs_root *root, u8 type = btrfs_dir_type(leaf, dir_item); if (type >= BTRFS_FT_MAX) { - printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", + btrfs_crit(root->fs_info, "invalid dir item type: %d", (int)type); return 1; } @@ -468,14 +468,16 @@ int verify_dir_item(struct btrfs_root *root, namelen = XATTR_NAME_MAX; if (btrfs_dir_name_len(leaf, dir_item) > namelen) { - printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n", + btrfs_crit(root->fs_info, "invalid dir item name len: %u", (unsigned)btrfs_dir_data_len(leaf, dir_item)); return 1; } /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ - if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) { - printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n", + if ((btrfs_dir_data_len(leaf, dir_item) + + btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) { + btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u", + (unsigned)btrfs_dir_name_len(leaf, dir_item), (unsigned)btrfs_dir_data_len(leaf, dir_item)); return 1; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ae17ed13b3..08e65e9cf2a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -26,16 +26,15 @@ #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/freezer.h> -#include <linux/crc32c.h> #include <linux/slab.h> #include <linux/migrate.h> #include <linux/ratelimit.h> #include <linux/uuid.h> #include <linux/semaphore.h> #include <asm/unaligned.h> -#include "compat.h" #include "ctree.h" #include "disk-io.h" +#include "hash.h" #include "transaction.h" #include "btrfs_inode.h" #include "volumes.h" @@ -49,6 +48,8 @@ #include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" +#include "sysfs.h" +#include "qgroup.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -64,7 +65,6 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, @@ -245,7 +245,7 @@ out: u32 btrfs_csum_data(char *data, u32 seed, size_t len) { - return crc32c(seed, data, len); + return btrfs_crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, char *result) @@ -301,11 +301,11 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " - "failed on %llu wanted %X found %X " - "level %d\n", - root->fs_info->sb->s_id, buf->start, - val, found, btrfs_header_level(buf)); + printk_ratelimited(KERN_INFO + "BTRFS: %s checksum verify failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, buf->start, + val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); return 1; @@ -330,6 +330,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, { struct extent_state *cached_state = NULL; int ret; + bool need_lock = (current->journal_info == + (void *)BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; @@ -337,6 +339,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; + if (need_lock) { + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } + lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 0, &cached_state); if (extent_buffer_uptodate(eb) && @@ -348,10 +355,22 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, "found %llu\n", eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; - clear_extent_buffer_uptodate(eb); + + /* + * Things reading via commit roots that don't have normal protection, + * like send, can have a really old block in cache that may point at a + * block that has been free'd and re-allocated. So don't clear uptodate + * if we find an eb that is under IO (dirty/writeback) because we could + * end up reading in the stale data and then writing it back out and + * making everybody very sad. + */ + if (!extent_buffer_under_io(eb)) + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); + if (need_lock) + btrfs_tree_read_unlock_blocking(eb); return ret; } @@ -384,13 +403,14 @@ static int btrfs_check_super_csum(char *raw_disk_sb) ret = 1; if (ret && btrfs_super_generation(disk_sb) < 10) { - printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n"); + printk(KERN_WARNING + "BTRFS: super block crcs don't match, older mkfs detected\n"); ret = 0; } } if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { - printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n", + printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n", csum_type); ret = 1; } @@ -466,25 +486,16 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { - struct extent_io_tree *tree; u64 start = page_offset(page); u64 found_start; struct extent_buffer *eb; - tree = &BTRFS_I(page->mapping->host)->io_tree; - eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) return 0; found_start = btrfs_header_bytenr(eb); - if (found_start != start) { - WARN_ON(1); + if (WARN_ON(found_start != start || !PageUptodate(page))) return 0; - } - if (!PageUptodate(page)) { - WARN_ON(1); - return 0; - } csum_tree_block(root, eb, 0); return 0; } @@ -496,7 +507,7 @@ static int check_tree_block_fsid(struct btrfs_root *root, u8 fsid[BTRFS_UUID_SIZE]; int ret = 1; - read_extent_buffer(eb, fsid, btrfs_header_fsid(eb), BTRFS_FSID_SIZE); + read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); while (fs_devices) { if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { ret = 0; @@ -508,8 +519,8 @@ static int check_tree_block_fsid(struct btrfs_root *root, } #define CORRUPT(reason, eb, root, slot) \ - printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ - "root=%llu, slot=%d\n", reason, \ + btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \ + "root=%llu, slot=%d", reason, \ btrfs_header_bytenr(eb), root->objectid, slot) static noinline int check_leaf(struct btrfs_root *root, @@ -577,7 +588,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) { - struct extent_io_tree *tree; u64 found_start; int found_level; struct extent_buffer *eb; @@ -588,7 +598,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, if (!page->private) goto out; - tree = &BTRFS_I(page->mapping->host)->io_tree; eb = (struct extent_buffer *)page->private; /* the pending IO might have been the only thing that kept this buffer @@ -608,21 +617,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "btrfs bad tree block start " + printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " "%llu %llu\n", found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", + printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_info(root->fs_info, "bad tree block level %d\n", + btrfs_info(root->fs_info, "bad tree block level %d", (int)btrfs_header_level(eb)); ret = -EIO; goto err; @@ -689,32 +698,31 @@ static void end_workqueue_bio(struct bio *bio, int err) fs_info = end_io_wq->info; end_io_wq->error = err; - end_io_wq->work.func = end_workqueue_fn; - end_io_wq->work.flags = 0; + btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); if (bio->bi_rw & REQ_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - btrfs_queue_worker(&fs_info->endio_meta_write_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_meta_write_workers, + &end_io_wq->work); else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - btrfs_queue_worker(&fs_info->endio_freespace_worker, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_freespace_worker, + &end_io_wq->work); else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_worker(&fs_info->endio_raid56_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_raid56_workers, + &end_io_wq->work); else - btrfs_queue_worker(&fs_info->endio_write_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_write_workers, + &end_io_wq->work); } else { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_worker(&fs_info->endio_raid56_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_raid56_workers, + &end_io_wq->work); else if (end_io_wq->metadata) - btrfs_queue_worker(&fs_info->endio_meta_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_meta_workers, + &end_io_wq->work); else - btrfs_queue_worker(&fs_info->endio_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_workers, + &end_io_wq->work); } } @@ -749,7 +757,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) { unsigned long limit = min_t(unsigned long, - info->workers.max_workers, + info->thread_pool_size, info->fs_devices->open_devices); return 256 * limit; } @@ -822,11 +830,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->submit_bio_start = submit_bio_start; async->submit_bio_done = submit_bio_done; - async->work.func = run_one_async_start; - async->work.ordered_func = run_one_async_done; - async->work.ordered_free = run_one_async_free; + btrfs_init_work(&async->work, run_one_async_start, + run_one_async_done, run_one_async_free); - async->work.flags = 0; async->bio_flags = bio_flags; async->bio_offset = bio_offset; @@ -835,9 +841,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); if (rw & REQ_SYNC) - btrfs_set_work_high_prio(&async->work); + btrfs_set_work_high_priority(&async->work); - btrfs_queue_worker(&fs_info->workers, &async->work); + btrfs_queue_work(fs_info->workers, &async->work); while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { @@ -850,20 +856,17 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, static int btree_csum_one_bio(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; + struct bio_vec *bvec; struct btrfs_root *root; - int ret = 0; + int i, ret = 0; - WARN_ON(bio->bi_vcnt <= 0); - while (bio_index < bio->bi_vcnt) { + bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; ret = csum_dirty_buffer(root, bvec->bv_page); if (ret) break; - bio_index++; - bvec++; } + return ret; } @@ -975,11 +978,9 @@ static int btree_migratepage(struct address_space *mapping, static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_io_tree *tree; struct btrfs_fs_info *fs_info; int ret; - tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { if (wbc->for_kupdate) @@ -1018,8 +1019,9 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, extent_invalidatepage(tree, page, offset); btree_releasepage(page, GFP_NOFS); if (PagePrivate(page)) { - printk(KERN_WARNING "btrfs warning page private not zero " - "on page %llu\n", (unsigned long long)page_offset(page)); + btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, + "page private not zero on page %llu", + (unsigned long long)page_offset(page)); ClearPagePrivate(page); set_page_private(page, 0); page_cache_release(page); @@ -1103,22 +1105,18 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize); - return eb; + return find_extent_buffer(root->fs_info, bytenr); } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - - eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize); - return eb; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return alloc_test_extent_buffer(root->fs_info, bytenr, + blocksize); +#endif + return alloc_extent_buffer(root->fs_info, bytenr, blocksize); } @@ -1173,6 +1171,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, } } +static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) +{ + struct btrfs_subvolume_writers *writers; + int ret; + + writers = kmalloc(sizeof(*writers), GFP_NOFS); + if (!writers) + return ERR_PTR(-ENOMEM); + + ret = percpu_counter_init(&writers->counter, 0); + if (ret < 0) { + kfree(writers); + return ERR_PTR(ret); + } + + init_waitqueue_head(&writers->wait); + return writers; +} + +static void +btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) +{ + percpu_counter_destroy(&writers->counter); + kfree(writers); +} + static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, @@ -1184,10 +1208,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->nodesize = nodesize; root->leafsize = leafsize; root->stripesize = stripesize; - root->ref_cows = 0; - root->track_dirty = 0; - root->in_radix = 0; - root->orphan_item_inserted = 0; + root->state = 0; root->orphan_cleanup_state = 0; root->objectid = objectid; @@ -1218,27 +1239,36 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(&root->log_extents_lock[1]); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); + mutex_init(&root->ordered_extent_mutex); + mutex_init(&root->delalloc_mutex); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); + INIT_LIST_HEAD(&root->log_ctxs[0]); + INIT_LIST_HEAD(&root->log_ctxs[1]); atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); + atomic_set(&root->will_be_snapshoted, 0); root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; - extent_io_tree_init(&root->dirty_log_pages, - fs_info->btree_inode->i_mapping); + if (fs_info) + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); memset(&root->root_kobj, 0, sizeof(root->root_kobj)); - root->defrag_trans_start = fs_info->generation; + if (fs_info) + root->defrag_trans_start = fs_info->generation; + else + root->defrag_trans_start = 0; init_completion(&root->kobj_unregister); - root->defrag_running = 0; root->root_key.objectid = objectid; root->anon_dev = 0; @@ -1253,6 +1283,23 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) return root; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +/* Should only be used by the testing infrastructure */ +struct btrfs_root *btrfs_alloc_dummy_root(void) +{ + struct btrfs_root *root; + + root = btrfs_alloc_root(NULL); + if (!root) + return ERR_PTR(-ENOMEM); + __setup_root(4096, 4096, 4096, 4096, root, NULL, 1); + set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); + root->alloc_bytenr = 0; + + return root; +} +#endif + struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 objectid) @@ -1262,7 +1309,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root; struct btrfs_key key; int ret = 0; - u64 bytenr; uuid_le uuid; root = btrfs_alloc_root(fs_info); @@ -1284,7 +1330,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, goto fail; } - bytenr = leaf->start; memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); @@ -1292,7 +1337,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_header_owner(leaf, objectid); root->node = leaf; - write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(leaf), + write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(leaf, fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(leaf), @@ -1300,8 +1345,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); root->commit_root = btrfs_root_node(root); - root->track_dirty = 1; - + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); root->root_item.flags = 0; root->root_item.byte_limit = 0; @@ -1330,6 +1374,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, fail: if (leaf) { btrfs_tree_unlock(leaf); + free_extent_buffer(root->commit_root); free_extent_buffer(leaf); } kfree(root); @@ -1355,13 +1400,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + /* + * DON'T set REF_COWS for log trees + * * log trees do not get reference counted because they go away * before a real commit is actually done. They do store pointers * to file data extents, and those reference counts still get * updated (along with back refs to the log tree). */ - root->ref_cows = 0; leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, BTRFS_TREE_LOG_OBJECTID, NULL, @@ -1379,7 +1426,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->node = leaf; write_extent_buffer(root->node, root->fs_info->fsid, - btrfs_header_fsid(root->node), BTRFS_FSID_SIZE); + btrfs_header_fsid(), BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); return root; @@ -1423,6 +1470,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; return 0; } @@ -1494,7 +1542,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, return root; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - root->ref_cows = 1; + set_bit(BTRFS_ROOT_REF_COWS, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1504,6 +1552,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, int btrfs_init_fs_root(struct btrfs_root *root) { int ret; + struct btrfs_subvolume_writers *writers; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), @@ -1513,15 +1562,24 @@ int btrfs_init_fs_root(struct btrfs_root *root) goto fail; } + writers = btrfs_alloc_subvolume_writers(); + if (IS_ERR(writers)) { + ret = PTR_ERR(writers); + goto fail; + } + root->subv_writers = writers; + btrfs_init_free_ino_ctl(root); - mutex_init(&root->fs_commit_mutex); spin_lock_init(&root->cache_lock); init_waitqueue_head(&root->cache_wait); ret = get_anon_bdev(&root->anon_dev); if (ret) - goto fail; + goto free_writers; return 0; + +free_writers: + btrfs_free_subvolume_writers(root->subv_writers); fail: kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); @@ -1554,15 +1612,16 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, (unsigned long)root->root_key.objectid, root); if (ret == 0) - root->in_radix = 1; + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); return ret; } -struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, - struct btrfs_key *location) +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + bool check_ref) { struct btrfs_root *root; int ret; @@ -1586,7 +1645,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { - if (btrfs_root_refs(&root->root_item) == 0) + if (check_ref && btrfs_root_refs(&root->root_item) == 0) return ERR_PTR(-ENOENT); return root; } @@ -1595,7 +1654,7 @@ again: if (IS_ERR(root)) return root; - if (btrfs_root_refs(&root->root_item) == 0) { + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { ret = -ENOENT; goto fail; } @@ -1604,11 +1663,12 @@ again: if (ret) goto fail; - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID, + location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret < 0) goto fail; if (ret == 0) - root->orphan_item_inserted = 1; + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { @@ -1672,18 +1732,16 @@ static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; struct end_io_wq *end_io_wq; - struct btrfs_fs_info *fs_info; int error; end_io_wq = container_of(work, struct end_io_wq, work); bio = end_io_wq->bio; - fs_info = end_io_wq->info; error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kfree(end_io_wq); - bio_endio(bio, error); + bio_endio_nodec(bio, error); } static int cleaner_kthread(void *arg) @@ -1779,6 +1837,9 @@ sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); + if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, + &root->fs_info->fs_state))) + btrfs_cleanup_transaction(root); if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && @@ -1993,72 +2054,50 @@ static noinline int next_root_backup(struct btrfs_fs_info *info, /* helper to cleanup workers */ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { - btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_raid56_workers); - btrfs_stop_workers(&fs_info->rmw_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->endio_freespace_worker); - btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->delayed_workers); - btrfs_stop_workers(&fs_info->caching_workers); - btrfs_stop_workers(&fs_info->readahead_workers); - btrfs_stop_workers(&fs_info->flush_workers); - btrfs_stop_workers(&fs_info->qgroup_rescan_workers); + btrfs_destroy_workqueue(fs_info->fixup_workers); + btrfs_destroy_workqueue(fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->workers); + btrfs_destroy_workqueue(fs_info->endio_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_workers); + btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + btrfs_destroy_workqueue(fs_info->rmw_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); + btrfs_destroy_workqueue(fs_info->endio_write_workers); + btrfs_destroy_workqueue(fs_info->endio_freespace_worker); + btrfs_destroy_workqueue(fs_info->submit_workers); + btrfs_destroy_workqueue(fs_info->delayed_workers); + btrfs_destroy_workqueue(fs_info->caching_workers); + btrfs_destroy_workqueue(fs_info->readahead_workers); + btrfs_destroy_workqueue(fs_info->flush_workers); + btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); + btrfs_destroy_workqueue(fs_info->extent_workers); +} + +static void free_root_extent_buffers(struct btrfs_root *root) +{ + if (root) { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + root->node = NULL; + root->commit_root = NULL; + } } /* helper to cleanup tree roots */ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) { - free_extent_buffer(info->tree_root->node); - free_extent_buffer(info->tree_root->commit_root); - info->tree_root->node = NULL; - info->tree_root->commit_root = NULL; - - if (info->dev_root) { - free_extent_buffer(info->dev_root->node); - free_extent_buffer(info->dev_root->commit_root); - info->dev_root->node = NULL; - info->dev_root->commit_root = NULL; - } - if (info->extent_root) { - free_extent_buffer(info->extent_root->node); - free_extent_buffer(info->extent_root->commit_root); - info->extent_root->node = NULL; - info->extent_root->commit_root = NULL; - } - if (info->csum_root) { - free_extent_buffer(info->csum_root->node); - free_extent_buffer(info->csum_root->commit_root); - info->csum_root->node = NULL; - info->csum_root->commit_root = NULL; - } - if (info->quota_root) { - free_extent_buffer(info->quota_root->node); - free_extent_buffer(info->quota_root->commit_root); - info->quota_root->node = NULL; - info->quota_root->commit_root = NULL; - } - if (info->uuid_root) { - free_extent_buffer(info->uuid_root->node); - free_extent_buffer(info->uuid_root->commit_root); - info->uuid_root->node = NULL; - info->uuid_root->commit_root = NULL; - } - if (chunk_root) { - free_extent_buffer(info->chunk_root->node); - free_extent_buffer(info->chunk_root->commit_root); - info->chunk_root->node = NULL; - info->chunk_root->commit_root = NULL; - } + free_root_extent_buffers(info->tree_root); + + free_root_extent_buffers(info->dev_root); + free_root_extent_buffers(info->extent_root); + free_root_extent_buffers(info->csum_root); + free_root_extent_buffers(info->quota_root); + free_root_extent_buffers(info->uuid_root); + if (chunk_root) + free_root_extent_buffers(info->chunk_root); } -static void del_fs_roots(struct btrfs_fs_info *fs_info) +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { int ret; struct btrfs_root *gang[8]; @@ -2069,7 +2108,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) struct btrfs_root, root_list); list_del(&gang[0]->root_list); - if (gang[0]->in_radix) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { btrfs_drop_and_free_fs_root(fs_info, gang[0]); } else { free_extent_buffer(gang[0]->node); @@ -2087,6 +2126,12 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + btrfs_free_log_root_tree(NULL, fs_info); + btrfs_destroy_pinned_extent(fs_info->tree_root, + fs_info->pinned_extents); + } } int open_ctree(struct super_block *sb, @@ -2116,6 +2161,8 @@ int open_ctree(struct super_block *sb, int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; + int max_active; + int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; bool create_uuid_tree; bool check_uuid_tree; @@ -2152,15 +2199,22 @@ int open_ctree(struct super_block *sb, goto fail_dirty_metadata_bytes; } + ret = percpu_counter_init(&fs_info->bio_counter, 0); + if (ret) { + err = ret; + goto fail_delalloc_bytes; + } + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_delalloc_bytes; + goto fail_bio_counter; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -2174,8 +2228,11 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->qgroup_op_lock); + spin_lock_init(&fs_info->buffer_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); + mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); init_completion(&fs_info->kobj_unregister); @@ -2197,6 +2254,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); + atomic_set(&fs_info->qgroup_op_seq, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; @@ -2205,7 +2263,7 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - + fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64); /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); @@ -2228,8 +2286,8 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->scrub_pause_req, 0); atomic_set(&fs_info->scrubs_paused, 0); atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->replace_wait); init_waitqueue_head(&fs_info->scrub_pause_wait); - init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY fs_info->check_integrity_print_mask = 0; @@ -2242,6 +2300,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); + btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2271,7 +2330,7 @@ int open_ctree(struct super_block *sb, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(fs_info->btree_inode)->runtime_flags); - insert_inode_hash(fs_info->btree_inode); + btrfs_insert_inode_hash(fs_info->btree_inode); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -2292,7 +2351,7 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); - init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); sema_init(&fs_info->uuid_tree_rescan_sem, 1); @@ -2305,6 +2364,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_lock); mutex_init(&fs_info->qgroup_ioctl_lock); fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_op_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; fs_info->quota_enabled = 0; @@ -2345,7 +2405,7 @@ int open_ctree(struct super_block *sb, * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ if (btrfs_check_super_csum(bh->b_data)) { - printk(KERN_ERR "btrfs: superblock checksum mismatch\n"); + printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); err = -EINVAL; goto fail_alloc; } @@ -2364,7 +2424,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { - printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); + printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); err = -EINVAL; goto fail_alloc; } @@ -2429,7 +2489,7 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_ERR "btrfs: has skinny extents\n"); + printk(KERN_ERR "BTRFS: has skinny extents\n"); /* * flag our filesystem as having big metadata blocks if @@ -2437,7 +2497,7 @@ int open_ctree(struct super_block *sb, */ if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - printk(KERN_INFO "btrfs flagging fs with big metadata feature\n"); + printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } @@ -2454,7 +2514,7 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != leafsize)) { - printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes " + printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " "are not allowed for mixed block groups on %s\n", sb->s_id); goto fail_alloc; @@ -2476,104 +2536,73 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - btrfs_init_workers(&fs_info->generic_worker, - "genwork", 1, NULL); - - btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size, - &fs_info->generic_worker); + max_active = fs_info->thread_pool_size; - btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size, NULL); + fs_info->workers = + btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, + max_active, 16); - btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", - fs_info->thread_pool_size, NULL); + fs_info->delalloc_workers = + btrfs_alloc_workqueue("delalloc", flags, max_active, 2); - btrfs_init_workers(&fs_info->submit_workers, "submit", - min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size), NULL); + fs_info->flush_workers = + btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); - btrfs_init_workers(&fs_info->caching_workers, "cache", - fs_info->thread_pool_size, NULL); + fs_info->caching_workers = + btrfs_alloc_workqueue("cache", flags, max_active, 0); - /* a higher idle thresh on the submit workers makes it much more + /* + * a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the * devices */ - fs_info->submit_workers.idle_thresh = 64; - - fs_info->workers.idle_thresh = 16; - fs_info->workers.ordered = 1; - - fs_info->delalloc_workers.idle_thresh = 2; - fs_info->delalloc_workers.ordered = 1; - - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_raid56_workers, - "endio-raid56", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->rmw_workers, - "rmw", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write", - 1, &fs_info->generic_worker); - btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->readahead_workers, "readahead", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, - &fs_info->generic_worker); + fs_info->submit_workers = + btrfs_alloc_workqueue("submit", flags, + min_t(u64, fs_devices->num_devices, + max_active), 64); + + fs_info->fixup_workers = + btrfs_alloc_workqueue("fixup", flags, 1, 0); /* * endios are largely parallel and should have a very * low idle thresh */ - fs_info->endio_workers.idle_thresh = 4; - fs_info->endio_meta_workers.idle_thresh = 4; - fs_info->endio_raid56_workers.idle_thresh = 4; - fs_info->rmw_workers.idle_thresh = 2; - - fs_info->endio_write_workers.idle_thresh = 2; - fs_info->endio_meta_write_workers.idle_thresh = 2; - fs_info->readahead_workers.idle_thresh = 2; - - /* - * btrfs_start_workers can really only fail because of ENOMEM so just - * return -ENOMEM if any of these fail. - */ - ret = btrfs_start_workers(&fs_info->workers); - ret |= btrfs_start_workers(&fs_info->generic_worker); - ret |= btrfs_start_workers(&fs_info->submit_workers); - ret |= btrfs_start_workers(&fs_info->delalloc_workers); - ret |= btrfs_start_workers(&fs_info->fixup_workers); - ret |= btrfs_start_workers(&fs_info->endio_workers); - ret |= btrfs_start_workers(&fs_info->endio_meta_workers); - ret |= btrfs_start_workers(&fs_info->rmw_workers); - ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); - ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); - ret |= btrfs_start_workers(&fs_info->endio_write_workers); - ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); - ret |= btrfs_start_workers(&fs_info->delayed_workers); - ret |= btrfs_start_workers(&fs_info->caching_workers); - ret |= btrfs_start_workers(&fs_info->readahead_workers); - ret |= btrfs_start_workers(&fs_info->flush_workers); - ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); - if (ret) { + fs_info->endio_workers = + btrfs_alloc_workqueue("endio", flags, max_active, 4); + fs_info->endio_meta_workers = + btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + fs_info->endio_meta_write_workers = + btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + fs_info->endio_raid56_workers = + btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->rmw_workers = + btrfs_alloc_workqueue("rmw", flags, max_active, 2); + fs_info->endio_write_workers = + btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + fs_info->endio_freespace_worker = + btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + fs_info->delayed_workers = + btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + fs_info->readahead_workers = + btrfs_alloc_workqueue("readahead", flags, max_active, 2); + fs_info->qgroup_rescan_workers = + btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + fs_info->extent_workers = + btrfs_alloc_workqueue("extent-refs", flags, + min_t(u64, fs_devices->num_devices, + max_active), 8); + + if (!(fs_info->workers && fs_info->delalloc_workers && + fs_info->submit_workers && fs_info->flush_workers && + fs_info->endio_workers && fs_info->endio_meta_workers && + fs_info->endio_meta_write_workers && + fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_freespace_worker && fs_info->rmw_workers && + fs_info->caching_workers && fs_info->readahead_workers && + fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->fixup_workers && fs_info->extent_workers && + fs_info->qgroup_rescan_workers)) { err = -ENOMEM; goto fail_sb_buffer; } @@ -2591,12 +2620,12 @@ int open_ctree(struct super_block *sb, sb->s_blocksize_bits = blksize_bits(sectorsize); if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } if (sectorsize != PAGE_SIZE) { - printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) " + printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) " "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } @@ -2605,7 +2634,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_WARNING "btrfs: failed to read the system " + printk(KERN_WARNING "BTRFS: failed to read the system " "array on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2622,7 +2651,7 @@ int open_ctree(struct super_block *sb, blocksize, generation); if (!chunk_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2634,7 +2663,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2646,7 +2675,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_CRIT "btrfs: failed to read devices on %s\n", + printk(KERN_CRIT "BTRFS: failed to read devices on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2661,7 +2690,7 @@ retry_root_backup: blocksize, generation); if (!tree_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); goto recovery_tree_root; @@ -2669,6 +2698,7 @@ retry_root_backup: btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); + btrfs_set_root_refs(&tree_root->root_item, 1); location.objectid = BTRFS_EXTENT_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; @@ -2679,7 +2709,7 @@ retry_root_backup: ret = PTR_ERR(extent_root); goto recovery_tree_root; } - extent_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state); fs_info->extent_root = extent_root; location.objectid = BTRFS_DEV_TREE_OBJECTID; @@ -2688,7 +2718,7 @@ retry_root_backup: ret = PTR_ERR(dev_root); goto recovery_tree_root; } - dev_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state); fs_info->dev_root = dev_root; btrfs_init_devices_late(fs_info); @@ -2698,13 +2728,13 @@ retry_root_backup: ret = PTR_ERR(csum_root); goto recovery_tree_root; } - csum_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state); fs_info->csum_root = csum_root; location.objectid = BTRFS_QUOTA_TREE_OBJECTID; quota_root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(quota_root)) { - quota_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, "a_root->state); fs_info->quota_enabled = 1; fs_info->pending_quota_state = 1; fs_info->quota_root = quota_root; @@ -2719,7 +2749,7 @@ retry_root_backup: create_uuid_tree = true; check_uuid_tree = false; } else { - uuid_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state); fs_info->uuid_root = uuid_root; create_uuid_tree = false; check_uuid_tree = @@ -2731,50 +2761,56 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_WARNING "btrfs: failed to recover balance\n"); + printk(KERN_WARNING "BTRFS: failed to recover balance\n"); goto fail_block_groups; } ret = btrfs_init_dev_stats(fs_info); if (ret) { - printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n", + printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", ret); goto fail_block_groups; } ret = btrfs_init_dev_replace(fs_info); if (ret) { - pr_err("btrfs: failed to init dev_replace: %d\n", ret); + pr_err("BTRFS: failed to init dev_replace: %d\n", ret); goto fail_block_groups; } btrfs_close_extra_devices(fs_info, fs_devices, 1); - ret = btrfs_init_space_info(fs_info); + ret = btrfs_sysfs_add_one(fs_info); if (ret) { - printk(KERN_ERR "Failed to initial space info: %d\n", ret); + pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); goto fail_block_groups; } + ret = btrfs_init_space_info(fs_info); + if (ret) { + printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); + goto fail_sysfs; + } + ret = btrfs_read_block_groups(extent_root); if (ret) { - printk(KERN_ERR "Failed to read block groups: %d\n", ret); - goto fail_block_groups; + printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); + goto fail_sysfs; } fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_WARNING - "Btrfs: too many missing devices, writeable mount is not allowed\n"); - goto fail_block_groups; + printk(KERN_WARNING "BTRFS: " + "too many missing devices, writeable mount is not allowed\n"); + goto fail_sysfs; } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) - goto fail_block_groups; + goto fail_sysfs; fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, @@ -2785,11 +2821,15 @@ retry_root_backup: if (!btrfs_test_opt(tree_root, SSD) && !btrfs_test_opt(tree_root, NOSSD) && !fs_info->fs_devices->rotating) { - printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " + printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " "mode\n"); btrfs_set_opt(fs_info->mount_opt, SSD); } + /* Set the real inode map cache flag */ + if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) + btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { ret = btrfsic_mount(tree_root, fs_devices, @@ -2798,7 +2838,7 @@ retry_root_backup: 1 : 0, fs_info->check_integrity_print_mask); if (ret) - printk(KERN_WARNING "btrfs: failed to initialize" + printk(KERN_WARNING "BTRFS: failed to initialize" " integrity check module %s\n", sb->s_id); } #endif @@ -2811,7 +2851,7 @@ retry_root_backup: u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "Btrfs log replay required " + printk(KERN_WARNING "BTRFS: log replay required " "on RO media\n"); err = -EIO; goto fail_qgroup; @@ -2834,10 +2874,10 @@ retry_root_backup: generation + 1); if (!log_tree_root->node || !extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "btrfs: failed to read log tree\n"); + printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); @@ -2846,29 +2886,31 @@ retry_root_backup: "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } if (sb->s_flags & MS_RDONLY) { ret = btrfs_commit_super(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; } } ret = btrfs_find_orphan_roots(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_cleanup_fs_roots(fs_info); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(tree_root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { printk(KERN_WARNING - "btrfs: failed to recover relocation\n"); + "BTRFS: failed to recover relocation\n"); err = -EINVAL; goto fail_qgroup; } @@ -2898,14 +2940,14 @@ retry_root_backup: ret = btrfs_resume_balance_async(fs_info); if (ret) { - printk(KERN_WARNING "btrfs: failed to resume balance\n"); + printk(KERN_WARNING "BTRFS: failed to resume balance\n"); close_ctree(tree_root); return ret; } ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("btrfs: failed to resume dev_replace\n"); + pr_warn("BTRFS: failed to resume dev_replace\n"); close_ctree(tree_root); return ret; } @@ -2913,20 +2955,20 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); if (create_uuid_tree) { - pr_info("btrfs: creating UUID tree\n"); + pr_info("BTRFS: creating UUID tree\n"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to create the UUID tree %d\n", + pr_warn("BTRFS: failed to create the UUID tree %d\n", ret); close_ctree(tree_root); return ret; } } else if (check_uuid_tree || btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { - pr_info("btrfs: checking UUID tree\n"); + pr_info("BTRFS: checking UUID tree\n"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to check the UUID tree %d\n", + pr_warn("BTRFS: failed to check the UUID tree %d\n", ret); close_ctree(tree_root); return ret; @@ -2942,7 +2984,7 @@ fail_qgroup: fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); btrfs_cleanup_transaction(fs_info->tree_root); - del_fs_roots(fs_info); + btrfs_free_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); @@ -2952,6 +2994,9 @@ fail_cleaner: */ filemap_write_and_wait(fs_info->btree_inode->i_mapping); +fail_sysfs: + btrfs_sysfs_remove_one(fs_info); + fail_block_groups: btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); @@ -2967,6 +3012,8 @@ fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); +fail_bio_counter: + percpu_counter_destroy(&fs_info->bio_counter); fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: @@ -3007,7 +3054,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to " + printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " "I/O error on %s\n", rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have @@ -3126,7 +3173,7 @@ static int write_dev_supers(struct btrfs_device *device, bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); if (!bh) { - printk(KERN_ERR "btrfs: couldn't get super " + printk(KERN_ERR "BTRFS: couldn't get super " "buffer head for bytenr %Lu\n", bytenr); errors++; continue; @@ -3147,7 +3194,10 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - ret = btrfsic_submit_bh(WRITE_FUA, bh); + if (i == 0) + ret = btrfsic_submit_bh(WRITE_FUA, bh); + else + ret = btrfsic_submit_bh(WRITE_SYNC, bh); if (ret) errors++; } @@ -3193,7 +3243,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk_in_rcu("btrfs: disabling barriers on dev %s\n", + printk_in_rcu("BTRFS: disabling barriers on dev %s\n", rcu_str_deref(device->name)); device->nobarriers = 1; } else if (!bio_flagged(bio, BIO_UPTODATE)) { @@ -3245,6 +3295,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info) /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; if (!dev->bdev) { errors_send++; continue; @@ -3259,6 +3311,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info) /* wait for all the barriers */ list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; if (!dev->bdev) { errors_wait++; continue; @@ -3414,7 +3468,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) total_errors++; } if (total_errors > max_errors) { - printk(KERN_ERR "btrfs: %d errors while writing supers\n", + btrfs_err(root->fs_info, "%d errors while writing supers", total_errors); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -3447,10 +3501,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors) { - int ret; - - ret = write_all_supers(root, max_mirrors); - return ret; + return write_all_supers(root, max_mirrors); } /* Drop a fs root from the radix tree and free it. */ @@ -3465,13 +3516,13 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_free_log(NULL, root); - btrfs_free_log_root_tree(NULL, fs_info); - } - __btrfs_remove_free_space_cache(root->free_ino_pinned); - __btrfs_remove_free_space_cache(root->free_ino_ctl); + if (root->free_ino_pinned) + __btrfs_remove_free_space_cache(root->free_ino_pinned); + if (root->free_ino_ctl) + __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); } @@ -3483,6 +3534,8 @@ static void free_fs_root(struct btrfs_root *root) root->orphan_block_rsv = NULL; if (root->anon_dev) free_anon_bdev(root->anon_dev); + if (root->subv_writers) + btrfs_free_subvolume_writers(root->subv_writers); free_extent_buffer(root->node); free_extent_buffer(root->commit_root); kfree(root->free_ino_ctl); @@ -3500,34 +3553,56 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; struct btrfs_root *gang[8]; - int i; - int ret; + int i = 0; + int err = 0; + unsigned int ret = 0; + int index; while (1) { + index = srcu_read_lock(&fs_info->subvol_srcu); ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); - if (!ret) + if (!ret) { + srcu_read_unlock(&fs_info->subvol_srcu, index); break; - + } root_objectid = gang[ret - 1]->root_key.objectid + 1; + for (i = 0; i < ret; i++) { - int err; + /* Avoid to grab roots in dead_roots */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* grab all the search result for later use */ + gang[i] = btrfs_grab_fs_root(gang[i]); + } + srcu_read_unlock(&fs_info->subvol_srcu, index); + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; root_objectid = gang[i]->root_key.objectid; err = btrfs_orphan_cleanup(gang[i]); if (err) - return err; + break; + btrfs_put_fs_root(gang[i]); } root_objectid++; } - return 0; + + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_fs_root(gang[i]); + } + return err; } int btrfs_commit_super(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - int ret; mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); @@ -3541,25 +3616,7 @@ int btrfs_commit_super(struct btrfs_root *root) trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - /* run commit again to drop the original snapshot */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - ret = btrfs_write_and_wait_transaction(NULL, root); - if (ret) { - btrfs_error(root->fs_info, ret, - "Failed to sync btree inode to disk."); - return ret; - } - - ret = write_ctree_super(NULL, root, 0); - return ret; + return btrfs_commit_transaction(trans, root); } int close_ctree(struct btrfs_root *root) @@ -3589,17 +3646,17 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); + cancel_work_sync(&fs_info->async_reclaim_work); + if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) - printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + btrfs_err(root->fs_info, "commit super ret %d", ret); } if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_error_commit_super(root); - btrfs_put_block_group_cache(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -3609,16 +3666,25 @@ int close_ctree(struct btrfs_root *root) btrfs_free_qgroup_config(root->fs_info); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { - printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", + btrfs_info(root->fs_info, "at unmount delalloc count %lld", percpu_counter_sum(&fs_info->delalloc_bytes)); } + btrfs_sysfs_remove_one(fs_info); + + btrfs_free_fs_roots(fs_info); + + btrfs_put_block_group_cache(fs_info); + btrfs_free_block_groups(fs_info); + /* + * we must make sure there is not any read request to + * submit after we stopping all workers. + */ + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); - del_fs_roots(fs_info); - free_root_pointers(fs_info, 1); iput(fs_info->btree_inode); @@ -3633,6 +3699,7 @@ int close_ctree(struct btrfs_root *root) percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->bio_counter); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); @@ -3668,10 +3735,20 @@ int btrfs_set_buffer_uptodate(struct extent_buffer *buf) void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { - struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; + struct btrfs_root *root; u64 transid = btrfs_header_generation(buf); int was_dirty; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + /* + * This is a fast path so only do this check if we have sanity tests + * enabled. Normal people shouldn't be marking dummy buffers as dirty + * outside of the sanity tests. + */ + if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags))) + return; +#endif + root = BTRFS_I(buf->pages[0]->mapping->host)->root; btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " @@ -3682,6 +3759,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, buf->len, root->fs_info->dirty_metadata_batch); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + btrfs_print_leaf(root, buf); + ASSERT(0); + } +#endif } static void __btrfs_btree_balance_dirty(struct btrfs_root *root, @@ -3801,11 +3884,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, ordered_root); - list_del_init(&root->ordered_root); + list_move_tail(&root->ordered_root, + &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); btrfs_destroy_ordered_extents(root); - cond_resched_lock(&fs_info->ordered_root_lock); + cond_resched(); + spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); } @@ -3821,55 +3907,54 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs = &trans->delayed_refs; spin_lock(&delayed_refs->lock); - if (delayed_refs->num_entries == 0) { + if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); - printk(KERN_INFO "delayed_refs has NO entry\n"); + btrfs_info(root->fs_info, "delayed_refs has NO entry"); return ret; } - while ((node = rb_first(&delayed_refs->root)) != NULL) { - struct btrfs_delayed_ref_head *head = NULL; + while ((node = rb_first(&delayed_refs->href_root)) != NULL) { + struct btrfs_delayed_ref_head *head; bool pin_bytes = false; - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - atomic_set(&ref->refs, 1); - if (btrfs_delayed_ref_is_head(ref)) { - - head = btrfs_delayed_node_to_head(ref); - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); - - /* Need to wait for the delayed ref to run */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); - spin_lock(&delayed_refs->lock); - continue; - } - - if (head->must_insert_reserved) - pin_bytes = true; - btrfs_free_delayed_extent_op(head->extent_op); - delayed_refs->num_heads--; - if (list_empty(&head->cluster)) - delayed_refs->num_heads_ready--; - list_del_init(&head->cluster); - } - - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; - spin_unlock(&delayed_refs->lock); - if (head) { - if (pin_bytes) - btrfs_pin_extent(root, ref->bytenr, - ref->num_bytes, 1); + mutex_lock(&head->mutex); mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + spin_lock(&delayed_refs->lock); + continue; } - btrfs_put_delayed_ref(ref); + spin_lock(&head->lock); + while ((node = rb_first(&head->ref_root)) != NULL) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + ref->in_tree = 0; + rb_erase(&ref->rb_node, &head->ref_root); + atomic_dec(&delayed_refs->num_entries); + btrfs_put_delayed_ref(ref); + } + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + delayed_refs->num_heads--; + if (head->processing == 0) + delayed_refs->num_heads_ready--; + atomic_dec(&delayed_refs->num_entries); + head->node.in_tree = 0; + rb_erase(&head->href_node, &delayed_refs->href_root); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + if (pin_bytes) + btrfs_pin_extent(root, head->node.bytenr, + head->node.num_bytes, 1); + btrfs_put_delayed_ref(&head->node); cond_resched(); spin_lock(&delayed_refs->lock); } @@ -3879,24 +3964,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t) -{ - struct btrfs_pending_snapshot *snapshot; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - list_splice_init(&t->pending_snapshots, &splice); - - while (!list_empty(&splice)) { - snapshot = list_entry(splice.next, - struct btrfs_pending_snapshot, - list); - snapshot->error = -ECANCELED; - list_del_init(&snapshot->list); - } -} - static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; @@ -4026,15 +4093,13 @@ again: void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { + btrfs_destroy_ordered_operations(cur_trans, root); + btrfs_destroy_delayed_refs(cur_trans, root); - btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, - cur_trans->dirty_pages.dirty_bytes); cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&root->fs_info->transaction_blocked_wait); - btrfs_evict_pending_snapshots(cur_trans); - cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); @@ -4058,63 +4123,51 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, static int btrfs_cleanup_transaction(struct btrfs_root *root) { struct btrfs_transaction *t; - LIST_HEAD(list); mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); - list_splice_init(&root->fs_info->trans_list, &list); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->trans_lock); - - while (!list_empty(&list)) { - t = list_entry(list.next, struct btrfs_transaction, list); - - btrfs_destroy_ordered_operations(t, root); - - btrfs_destroy_all_ordered_extents(root->fs_info); - - btrfs_destroy_delayed_refs(t, root); - - /* - * FIXME: cleanup wait for commit - * We needn't acquire the lock here, because we are during - * the umount, there is no other task which will change it. - */ - t->state = TRANS_STATE_COMMIT_START; - smp_mb(); - if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) - wake_up(&root->fs_info->transaction_blocked_wait); - - btrfs_evict_pending_snapshots(t); - - t->state = TRANS_STATE_UNBLOCKED; - smp_mb(); - if (waitqueue_active(&root->fs_info->transaction_wait)) - wake_up(&root->fs_info->transaction_wait); - - btrfs_destroy_delayed_inodes(root); - btrfs_assert_delayed_root_empty(root); - - btrfs_destroy_all_delalloc_inodes(root->fs_info); - - btrfs_destroy_marked_extents(root, &t->dirty_pages, - EXTENT_DIRTY); - - btrfs_destroy_pinned_extent(root, - root->fs_info->pinned_extents); - - t->state = TRANS_STATE_COMPLETED; - smp_mb(); - if (waitqueue_active(&t->commit_wait)) - wake_up(&t->commit_wait); + while (!list_empty(&root->fs_info->trans_list)) { + t = list_first_entry(&root->fs_info->trans_list, + struct btrfs_transaction, list); + if (t->state >= TRANS_STATE_COMMIT_START) { + atomic_inc(&t->use_count); + spin_unlock(&root->fs_info->trans_lock); + btrfs_wait_for_commit(root, t->transid); + btrfs_put_transaction(t); + spin_lock(&root->fs_info->trans_lock); + continue; + } + if (t == root->fs_info->running_transaction) { + t->state = TRANS_STATE_COMMIT_DOING; + spin_unlock(&root->fs_info->trans_lock); + /* + * We wait for 0 num_writers since we don't hold a trans + * handle open currently for this transaction. + */ + wait_event(t->writer_wait, + atomic_read(&t->num_writers) == 0); + } else { + spin_unlock(&root->fs_info->trans_lock); + } + btrfs_cleanup_one_transaction(t, root); - atomic_set(&t->use_count, 0); + spin_lock(&root->fs_info->trans_lock); + if (t == root->fs_info->running_transaction) + root->fs_info->running_transaction = NULL; list_del_init(&t->list); - memset(t, 0, sizeof(*t)); - kmem_cache_free(btrfs_transaction_cachep, t); - } + spin_unlock(&root->fs_info->trans_lock); + btrfs_put_transaction(t); + trace_btrfs_transaction_commit(root); + spin_lock(&root->fs_info->trans_lock); + } + spin_unlock(&root->fs_info->trans_lock); + btrfs_destroy_all_ordered_extents(root->fs_info); + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); + btrfs_destroy_all_delalloc_inodes(root->fs_info); mutex_unlock(&root->fs_info->transaction_kthread_mutex); return 0; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b71acd6e1e5..23ce3ceba0a 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,8 +68,18 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, int btrfs_init_fs_root(struct btrfs_root *root); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, - struct btrfs_key *location); +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); + +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key, + bool check_ref); +static inline struct btrfs_root * +btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) +{ + return btrfs_get_fs_root(fs_info, location, true); +} + int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_root *root); void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); @@ -77,6 +87,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_root *btrfs_alloc_dummy_root(void); +#endif + /* * This function is used to grab the root, and avoid it is freed when we * access it. But it doesn't ensure that the tree is not dropped. diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 4b869160737..41422a3de8e 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -5,7 +5,6 @@ #include "btrfs_inode.h" #include "print-tree.h" #include "export.h" -#include "compat.h" #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ parent_objectid) / 4) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d58bef130a4..813537f362f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -25,17 +25,17 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/percpu_counter.h> -#include "compat.h" #include "hash.h" -#include "ctree.h" +#include "tree-log.h" #include "disk-io.h" #include "print-tree.h" -#include "transaction.h" #include "volumes.h" #include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "math.h" +#include "sysfs.h" +#include "qgroup.h" #undef SCRAMBLE_DELAYED_REFS @@ -81,7 +81,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op); + struct btrfs_delayed_extent_op *extra_op, + int no_quota); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -94,7 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins); + int level, struct btrfs_key *ins, + int no_quota); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force); @@ -103,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level, static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve); + u64 num_bytes, int reserve, + int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, @@ -419,7 +422,7 @@ static noinline void caching_thread(struct btrfs_work *work) again: mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->extent_commit_sem); + down_read(&fs_info->commit_root_sem); next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -442,10 +445,11 @@ next: if (ret) break; - if (need_resched()) { + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { caching_ctl->progress = last; btrfs_release_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); cond_resched(); goto again; @@ -512,7 +516,7 @@ next: err: btrfs_free_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); free_excluded_extents(extent_root, block_group); @@ -548,7 +552,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; atomic_set(&caching_ctl->count, 1); - caching_ctl->work.func = caching_thread; + btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); spin_lock(&cache->lock); /* @@ -632,14 +636,14 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, return 0; } - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); btrfs_get_block_group(cache); - btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); + btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); return ret; } @@ -768,20 +772,19 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - if (metadata) { - key.objectid = bytenr; - key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = offset; - } else { - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = offset; - } - if (!trans) { path->skip_locking = 1; path->search_commit_root = 1; } + +search_again: + key.objectid = bytenr; + key.offset = offset; + if (metadata) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + again: ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 0); @@ -789,7 +792,6 @@ again: goto out_free; if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { - metadata = 0; if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -856,14 +858,16 @@ again: mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - goto again; + goto search_again; } + spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; else BUG_ON(num_refs == 0); num_refs += head->node.ref_mod; + spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); @@ -1073,11 +1077,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -1270,7 +1274,7 @@ fail: static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop) + int refs_to_drop, int *last_ref) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -1306,6 +1310,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); + *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1541,6 +1546,7 @@ again: ret = 0; } if (ret) { + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -1551,9 +1557,8 @@ again: if (ret && !insert) { err = -ENOENT; goto out; - } else if (ret) { + } else if (WARN_ON(ret)) { err = -EIO; - WARN_ON(1); goto out; } @@ -1763,7 +1768,8 @@ void update_inline_extent_backref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int *last_ref) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1807,6 +1813,7 @@ void update_inline_extent_backref(struct btrfs_root *root, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { + *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1838,7 +1845,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); update_inline_extent_backref(root, path, iref, - refs_to_add, extent_op); + refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { setup_inline_extent_backref(root, path, iref, parent, root_objectid, owner, offset, @@ -1871,17 +1878,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data) + int refs_to_drop, int is_data, int *last_ref) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) { update_inline_extent_backref(root, path, iref, - -refs_to_drop, NULL); + -refs_to_drop, NULL, last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, + last_ref); } else { + *last_ref = 1; ret = btrfs_del_item(trans, root, path); } return ret; @@ -1945,7 +1954,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow) + u64 root_objectid, u64 owner, u64 offset, + int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1957,12 +1967,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } return ret; } @@ -1972,37 +1982,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, + int no_quota, struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_extent_item *item; + struct btrfs_key key; u64 refs; int ret; - int err = 0; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) + no_quota = 1; + path->reada = 1; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, num_bytes, parent, + ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, + bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if (ret == 0) + if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) goto out; + /* + * Ok we were able to insert an inline extent and it appears to be a new + * reference, deal with the qgroup accounting. + */ + if (!ret && !no_quota) { + ASSERT(root->fs_info->quota_enabled); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) + type = BTRFS_QGROUP_OPER_ADD_SHARED; + btrfs_release_path(path); - if (ret != -EAGAIN) { - err = ret; + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); goto out; } + /* + * Ok we had -EAGAIN which means we didn't have space to insert and + * inline extent ref, so just update the reference count and add a + * normal backref. + */ leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); + if (refs) + type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); @@ -2010,9 +2047,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); + if (ret) + goto out; + } + path->reada = 1; path->leave_spinning = 1; - /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, root_objectid, @@ -2021,7 +2064,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); out: btrfs_free_path(path); - return err; + return ret; } static int run_delayed_data_ref(struct btrfs_trans_handle *trans, @@ -2046,8 +2089,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { if (extent_op) @@ -2061,13 +2103,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + node->no_quota, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + extent_op, node->no_quota); } else { BUG(); } @@ -2137,15 +2179,28 @@ again: } if (ret > 0) { if (metadata) { - btrfs_release_path(path); - metadata = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == node->bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == node->num_bytes) + ret = 0; + } + if (ret > 0) { + btrfs_release_path(path); + metadata = 0; - key.offset = node->num_bytes; - key.type = BTRFS_EXTENT_ITEM_KEY; - goto again; + key.objectid = node->bytenr; + key.offset = node->num_bytes; + key.type = BTRFS_EXTENT_ITEM_KEY; + goto again; + } + } else { + err = -EIO; + goto out; } - err = -EIO; - goto out; } leaf = path->nodes[0]; @@ -2191,8 +2246,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; ins.objectid = node->bytenr; if (skinny_metadata) { @@ -2210,15 +2264,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins); + ref->level, &ins, + node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, node->no_quota, + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, extent_op, + node->no_quota); } else { BUG(); } @@ -2234,8 +2291,12 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, { int ret = 0; - if (trans->aborted) + if (trans->aborted) { + if (insert_reserved) + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); return 0; + } if (btrfs_delayed_ref_is_head(node)) { struct btrfs_delayed_ref_head *head; @@ -2278,64 +2339,62 @@ static noinline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - int action = BTRFS_ADD_DELAYED_REF; -again: + struct btrfs_delayed_ref_node *ref, *last = NULL;; + /* * select delayed ref of type BTRFS_ADD_DELAYED_REF first. * this prevents ref count from going down to zero when * there still are pending delayed ref. */ - node = rb_prev(&head->node.rb_node); - while (1) { - if (!node) - break; + node = rb_first(&head->ref_root); + while (node) { ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr != head->node.bytenr) - break; - if (ref->action == action) + if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - node = rb_prev(node); + else if (last == NULL) + last = ref; + node = rb_next(node); } - if (action == BTRFS_ADD_DELAYED_REF) { - action = BTRFS_DROP_DELAYED_REF; - goto again; - } - return NULL; + return last; } /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct list_head *cluster) +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + unsigned long nr) { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; struct btrfs_fs_info *fs_info = root->fs_info; + ktime_t start = ktime_get(); int ret; - int count = 0; + unsigned long count = 0; + unsigned long actual_count = 0; int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; while (1) { if (!locked_ref) { - /* pick a new head ref from the cluster list */ - if (list_empty(cluster)) + if (count >= nr) break; - locked_ref = list_entry(cluster->next, - struct btrfs_delayed_ref_head, cluster); + spin_lock(&delayed_refs->lock); + locked_ref = btrfs_select_ref_head(trans); + if (!locked_ref) { + spin_unlock(&delayed_refs->lock); + break; + } /* grab the lock that says we are going to process * all the refs for this head */ ret = btrfs_delayed_ref_lock(trans, locked_ref); - + spin_unlock(&delayed_refs->lock); /* * we may have dropped the spin lock to get the head * mutex lock, and that might have given someone else @@ -2356,6 +2415,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * finish. If we merged anything we need to re-loop so we can * get a good ref. */ + spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, locked_ref); @@ -2367,17 +2427,15 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { - /* - * there are still refs with lower seq numbers in the - * process of being added. Don't run this ref yet. - */ - list_del_init(&locked_ref->cluster); + spin_unlock(&locked_ref->lock); btrfs_delayed_ref_unlock(locked_ref); - locked_ref = NULL; + spin_lock(&delayed_refs->lock); + locked_ref->processing = 0; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); + locked_ref = NULL; cond_resched(); - spin_lock(&delayed_refs->lock); + count++; continue; } @@ -2392,6 +2450,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, locked_ref->extent_op = NULL; if (!ref) { + + /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, * so that any accounting fixes can happen @@ -2404,26 +2464,54 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } if (extent_op) { - spin_unlock(&delayed_refs->lock); - + spin_unlock(&locked_ref->lock); ret = run_delayed_extent_op(trans, root, ref, extent_op); btrfs_free_delayed_extent_op(extent_op); if (ret) { + /* + * Need to reset must_insert_reserved if + * there was an error so the abort stuff + * can cleanup the reserved space + * properly. + */ + if (must_insert_reserved) + locked_ref->must_insert_reserved = 1; + locked_ref->processing = 0; btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); - spin_lock(&delayed_refs->lock); btrfs_delayed_ref_unlock(locked_ref); return ret; } + continue; + } - goto next; + /* + * Need to drop our head ref lock and re-aqcuire the + * delayed ref lock and then re-check to make sure + * nobody got added. + */ + spin_unlock(&locked_ref->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&locked_ref->lock); + if (rb_first(&locked_ref->ref_root) || + locked_ref->extent_op) { + spin_unlock(&locked_ref->lock); + spin_unlock(&delayed_refs->lock); + continue; } + ref->in_tree = 0; + delayed_refs->num_heads--; + rb_erase(&locked_ref->href_node, + &delayed_refs->href_root); + spin_unlock(&delayed_refs->lock); + } else { + actual_count++; + ref->in_tree = 0; + rb_erase(&ref->rb_node, &locked_ref->ref_root); } + atomic_dec(&delayed_refs->num_entries); - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; if (!btrfs_delayed_ref_is_head(ref)) { /* * when we play the delayed ref, also correct the @@ -2440,20 +2528,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - } else { - list_del_init(&locked_ref->cluster); } - spin_unlock(&delayed_refs->lock); + spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); btrfs_free_delayed_extent_op(extent_op); if (ret) { + locked_ref->processing = 0; btrfs_delayed_ref_unlock(locked_ref); btrfs_put_delayed_ref(ref); btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); - spin_lock(&delayed_refs->lock); return ret; } @@ -2469,11 +2555,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } btrfs_put_delayed_ref(ref); count++; -next: cond_resched(); + } + + /* + * We don't want to include ref heads since we can have empty ref heads + * and those will drastically skew our runtime down since we just do + * accounting, no actual extent tree updates. + */ + if (actual_count > 0) { + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); + u64 avg; + + /* + * We weigh the current average higher than our current runtime + * to avoid large swings in the average. + */ spin_lock(&delayed_refs->lock); + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; + avg = div64_u64(avg, 4); + fs_info->avg_delayed_ref_runtime = avg; + spin_unlock(&delayed_refs->lock); } - return count; + return 0; } #ifdef SCRAMBLE_DELAYED_REFS @@ -2519,52 +2623,6 @@ static u64 find_middle(struct rb_root *root) } #endif -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct qgroup_update *qgroup_update; - int ret = 0; - - if (list_empty(&trans->qgroup_ref_list) != - !trans->delayed_ref_elem.seq) { - /* list without seq or seq without list */ - btrfs_err(fs_info, - "qgroup accounting update error, list is%s empty, seq is %#x.%x", - list_empty(&trans->qgroup_ref_list) ? "" : " not", - (u32)(trans->delayed_ref_elem.seq >> 32), - (u32)trans->delayed_ref_elem.seq); - BUG(); - } - - if (!trans->delayed_ref_elem.seq) - return 0; - - while (!list_empty(&trans->qgroup_ref_list)) { - qgroup_update = list_first_entry(&trans->qgroup_ref_list, - struct qgroup_update, list); - list_del(&qgroup_update->list); - if (!ret) - ret = btrfs_qgroup_account_ref( - trans, fs_info, qgroup_update->node, - qgroup_update->extent_op); - kfree(qgroup_update); - } - - btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); - - return ret; -} - -static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, - int count) -{ - int val = atomic_read(&delayed_refs->ref_seq); - - if (val < seq || val >= seq + count) - return 1; - return 0; -} - static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) { u64 num_bytes; @@ -2581,7 +2639,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_rsv *global_rsv; @@ -2610,6 +2668,101 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, return ret; } +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + u64 val; + + smp_mb(); + avg_runtime = fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; + if (num_entries * avg_runtime >= NSEC_PER_SEC) + return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; + + return btrfs_check_space_for_delayed_refs(trans, root); +} + +struct async_delayed_refs { + struct btrfs_root *root; + int count; + int error; + int sync; + struct completion wait; + struct btrfs_work work; +}; + +static void delayed_ref_async_start(struct btrfs_work *work) +{ + struct async_delayed_refs *async; + struct btrfs_trans_handle *trans; + int ret; + + async = container_of(work, struct async_delayed_refs, work); + + trans = btrfs_join_transaction(async->root); + if (IS_ERR(trans)) { + async->error = PTR_ERR(trans); + goto done; + } + + /* + * trans->sync means that when we call end_transaciton, we won't + * wait on delayed refs + */ + trans->sync = true; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); + if (ret) + async->error = ret; + + ret = btrfs_end_transaction(trans, async->root); + if (ret && !async->error) + async->error = ret; +done: + if (async->sync) + complete(&async->wait); + else + kfree(async); +} + +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait) +{ + struct async_delayed_refs *async; + int ret; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->root = root->fs_info->tree_root; + async->count = count; + async->error = 0; + if (wait) + async->sync = 1; + else + async->sync = 0; + init_completion(&async->wait); + + btrfs_init_work(&async->work, delayed_ref_async_start, + NULL, NULL); + + btrfs_queue_work(root->fs_info->extent_workers, &async->work); + + if (wait) { + wait_for_completion(&async->wait); + ret = async->error; + kfree(async); + return ret; + } + return 0; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2625,13 +2778,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct list_head cluster; + struct btrfs_delayed_ref_head *head; int ret; - u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; - int loops; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2640,133 +2790,41 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - delayed_refs = &trans->transaction->delayed_refs; - INIT_LIST_HEAD(&cluster); if (count == 0) { - count = delayed_refs->num_entries * 2; + count = atomic_read(&delayed_refs->num_entries) * 2; run_most = 1; } - if (!run_all && !run_most) { - int old; - int seq = atomic_read(&delayed_refs->ref_seq); - -progress: - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - DEFINE_WAIT(__wait); - if (delayed_refs->flushing || - !btrfs_should_throttle_delayed_refs(trans, root)) - return 0; - - prepare_to_wait(&delayed_refs->wait, &__wait, - TASK_UNINTERRUPTIBLE); - - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - schedule(); - finish_wait(&delayed_refs->wait, &__wait); - - if (!refs_newer(delayed_refs, seq, 256)) - goto progress; - else - return 0; - } else { - finish_wait(&delayed_refs->wait, &__wait); - goto again; - } - } - - } else { - atomic_inc(&delayed_refs->procs_running_refs); - } - again: - loops = 0; - spin_lock(&delayed_refs->lock); - #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - - while (1) { - if (!(run_all || run_most) && - !btrfs_should_throttle_delayed_refs(trans, root)) - break; - - /* - * go find something we can process in the rbtree. We start at - * the beginning of the tree, and then build a cluster - * of refs to process starting at the first one we are able to - * lock - */ - delayed_start = delayed_refs->run_delayed_start; - ret = btrfs_find_ref_cluster(trans, &cluster, - delayed_refs->run_delayed_start); - if (ret) - break; - - ret = run_clustered_refs(trans, root, &cluster); - if (ret < 0) { - btrfs_release_ref_cluster(&cluster); - spin_unlock(&delayed_refs->lock); - btrfs_abort_transaction(trans, root, ret); - atomic_dec(&delayed_refs->procs_running_refs); - wake_up(&delayed_refs->wait); - return ret; - } - - atomic_add(ret, &delayed_refs->ref_seq); - - count -= min_t(unsigned long, ret, count); - - if (count == 0) - break; - - if (delayed_start >= delayed_refs->run_delayed_start) { - if (loops == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked), bail out. - */ - loops = 1; - } else { - /* - * no runnable refs left, stop trying - */ - BUG_ON(run_all); - break; - } - } - if (ret) { - /* refs were run, let's reset staleness detection */ - loops = 0; - } + ret = __btrfs_run_delayed_refs(trans, root, count); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + return ret; } if (run_all) { - if (!list_empty(&trans->new_bgs)) { - spin_unlock(&delayed_refs->lock); + if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - spin_lock(&delayed_refs->lock); - } - node = rb_first(&delayed_refs->root); - if (!node) + spin_lock(&delayed_refs->lock); + node = rb_first(&delayed_refs->href_root); + if (!node) { + spin_unlock(&delayed_refs->lock); goto out; + } count = (unsigned long)-1; while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (btrfs_delayed_ref_is_head(&head->node)) { + struct btrfs_delayed_ref_node *ref; - head = btrfs_delayed_node_to_head(ref); + ref = &head->node; atomic_inc(&ref->refs); spin_unlock(&delayed_refs->lock); @@ -2780,20 +2838,19 @@ again: btrfs_put_delayed_ref(ref); cond_resched(); goto again; + } else { + WARN_ON(1); } node = rb_next(node); } spin_unlock(&delayed_refs->lock); - schedule_timeout(1); + cond_resched(); goto again; } out: - atomic_dec(&delayed_refs->procs_running_refs); - smp_mb(); - if (waitqueue_active(&delayed_refs->wait)) - wake_up(&delayed_refs->wait); - - spin_unlock(&delayed_refs->lock); + ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); + if (ret) + return ret; assert_qgroups_uptodate(trans); return 0; } @@ -2835,12 +2892,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct rb_node *node; int ret = 0; - ret = -ENOENT; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; + if (!head) { + spin_unlock(&delayed_refs->lock); + return 0; + } if (!mutex_trylock(&head->mutex)) { atomic_inc(&head->node.refs); @@ -2857,40 +2915,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return -EAGAIN; } + spin_unlock(&delayed_refs->lock); - node = rb_prev(&head->node.rb_node); - if (!node) - goto out_unlock; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - if (ref->bytenr != bytenr) - goto out_unlock; - - ret = 1; - if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) - goto out_unlock; + spin_lock(&head->lock); + node = rb_first(&head->ref_root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); - data_ref = btrfs_delayed_node_to_data_ref(ref); + /* If it's a shared ref we know a cross reference exists */ + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { + ret = 1; + break; + } - node = rb_prev(node); - if (node) { - int seq = ref->seq; + data_ref = btrfs_delayed_node_to_data_ref(ref); - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr && ref->seq == seq) - goto out_unlock; + /* + * If our ref doesn't match the one we're currently looking at + * then we have a cross reference. + */ + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || + data_ref->offset != offset) { + ret = 1; + break; + } } - - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || data_ref->offset != offset) - goto out_unlock; - - ret = 0; -out_unlock: + spin_unlock(&head->lock); mutex_unlock(&head->mutex); -out: - spin_unlock(&delayed_refs->lock); return ret; } @@ -3004,7 +3057,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc, int for_cow) + int full_backref, int inc, int no_quota) { u64 bytenr; u64 num_bytes; @@ -3019,11 +3072,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, u64, u64, u64, u64, u64, u64, int); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!root->ref_cows && level == 0) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) return 0; if (inc) @@ -3054,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, for_cow); + key.offset, no_quota); if (ret) goto fail; } else { @@ -3062,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0, - for_cow); + no_quota); if (ret) goto fail; } @@ -3073,15 +3130,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -3197,15 +3254,15 @@ again: if (ret) goto out_put; - ret = btrfs_truncate_free_space_cache(root, trans, path, - inode); + ret = btrfs_truncate_free_space_cache(root, trans, inode); if (ret) goto out_put; } spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE)) { + !btrfs_test_opt(root, SPACE_CACHE) || + block_group->delalloc_bytes) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3318,10 +3375,9 @@ again: last = cache->key.objectid + cache->key.offset; err = write_one_cache_group(trans, root, path, cache); + btrfs_put_block_group(cache); if (err) /* File system offline */ goto out; - - btrfs_put_block_group(cache); } while (1) { @@ -3389,6 +3445,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, struct btrfs_space_info **space_info) @@ -3444,11 +3517,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->chunk_alloc = 0; found->flush = 0; init_waitqueue_head(&found->wait); + + ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(found->flags)); + if (ret) { + kfree(found); + return ret; + } + *space_info = found; list_add_rcu(&found->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = found; - return 0; + + return ret; } static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) @@ -3556,11 +3639,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return extended_to_chunk(flags | tmp); } -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) { unsigned seq; + u64 flags; do { + flags = orig_flags; seq = read_seqbegin(&root->fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) @@ -3605,10 +3690,9 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) /* make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, root->sectorsize); - if (root == root->fs_info->tree_root || - BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { - alloc_chunk = 0; + if (btrfs_is_free_space_inode(inode)) { committed = 1; + ASSERT(current->journal_info); } data_sinfo = fs_info->data_sinfo; @@ -3636,6 +3720,16 @@ again: spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); + /* + * It is ugly that we don't call nolock join + * transaction for the free space inode case here. + * But it is safe because we only do the data space + * reservation for the free space cache in the + * transaction context, the common join transaction + * just increase the counter of the current transaction + * handler, doesn't try to acquire the trans_lock of + * the fs. + */ trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3681,6 +3775,9 @@ commit_trans: goto again; } + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -3974,7 +4071,7 @@ static int can_overcommit(struct btrfs_root *root, } static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, - unsigned long nr_pages) + unsigned long nr_pages, int nr_items) { struct super_block *sb = root->fs_info->sb; @@ -3989,12 +4086,26 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, * the filesystem is readonly(all dirty pages are written to * the disk). */ - btrfs_start_all_delalloc_inodes(root->fs_info, 0); + btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_all_ordered_extents(root->fs_info); + btrfs_wait_ordered_roots(root->fs_info, nr_items); } } +static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) +{ + u64 bytes; + int nr; + + bytes = btrfs_calc_trans_metadata_size(root, 1); + nr = (int)div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + +#define EXTENT_SIZE_PER_ITEM (256 * 1024) + /* * shrink metadata reservation for delalloc */ @@ -4007,35 +4118,51 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, u64 delalloc_bytes; u64 max_reclaim; long time_left; - unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; - int loops = 0; + unsigned long nr_pages; + int loops; + int items; enum btrfs_reserve_flush_enum flush; + /* Calc the number of the pages we need flush for space reservation */ + items = calc_reclaim_items_nr(root, to_reclaim); + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; - smp_mb(); delalloc_bytes = percpu_counter_sum_positive( &root->fs_info->delalloc_bytes); if (delalloc_bytes == 0) { if (trans) return; - btrfs_wait_all_ordered_extents(root->fs_info); + if (wait_ordered) + btrfs_wait_ordered_roots(root->fs_info, items); return; } + loops = 0; while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - btrfs_writeback_inodes_sb_nr(root, nr_pages); + btrfs_writeback_inodes_sb_nr(root, nr_pages, items); /* * We need to wait for the async pages to actually start before * we do anything. */ - wait_event(root->fs_info->async_submit_wait, - !atomic_read(&root->fs_info->async_delalloc_pages)); + max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); + if (!max_reclaim) + goto skip_async; + + if (max_reclaim <= nr_pages) + max_reclaim = 0; + else + max_reclaim -= nr_pages; + wait_event(root->fs_info->async_submit_wait, + atomic_read(&root->fs_info->async_delalloc_pages) <= + (int)max_reclaim); +skip_async: if (!trans) flush = BTRFS_RESERVE_FLUSH_ALL; else @@ -4049,13 +4176,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, loops++; if (wait_ordered && !trans) { - btrfs_wait_all_ordered_extents(root->fs_info); + btrfs_wait_ordered_roots(root->fs_info, items); } else { time_left = schedule_timeout_killable(1); if (time_left) break; } - smp_mb(); delalloc_bytes = percpu_counter_sum_positive( &root->fs_info->delalloc_bytes); } @@ -4086,13 +4212,9 @@ static int may_commit_transaction(struct btrfs_root *root, goto commit; /* See if there is enough pinned space to make this reservation */ - spin_lock(&space_info->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes) >= 0) { - spin_unlock(&space_info->lock); + bytes) >= 0) goto commit; - } - spin_unlock(&space_info->lock); /* * See if there is some space in the delayed insertion reservation for @@ -4101,16 +4223,13 @@ static int may_commit_transaction(struct btrfs_root *root, if (space_info != delayed_rsv->space_info) return -ENOSPC; - spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, bytes - delayed_rsv->size) >= 0) { spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); commit: trans = btrfs_join_transaction(root); @@ -4140,16 +4259,11 @@ static int flush_space(struct btrfs_root *root, switch (state) { case FLUSH_DELAYED_ITEMS_NR: case FLUSH_DELAYED_ITEMS: - if (state == FLUSH_DELAYED_ITEMS_NR) { - u64 bytes = btrfs_calc_trans_metadata_size(root, 1); - - nr = (int)div64_u64(num_bytes, bytes); - if (!nr) - nr = 1; - nr *= 2; - } else { + if (state == FLUSH_DELAYED_ITEMS_NR) + nr = calc_reclaim_items_nr(root, num_bytes) * 2; + else nr = -1; - } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -4160,7 +4274,7 @@ static int flush_space(struct btrfs_root *root, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes, orig_bytes, + shrink_delalloc(root, num_bytes * 2, orig_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: @@ -4186,6 +4300,104 @@ static int flush_space(struct btrfs_root *root, return ret; } + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, + struct btrfs_space_info *space_info) +{ + u64 used; + u64 expected; + u64 to_reclaim; + + to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, + 16 * 1024 * 1024); + spin_lock(&space_info->lock); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) { + to_reclaim = 0; + goto out; + } + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (can_overcommit(root, space_info, 1024 * 1024, + BTRFS_RESERVE_FLUSH_ALL)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); +out: + spin_unlock(&space_info->lock); + + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info, u64 used) +{ + return (used >= div_factor_fine(space_info->total_bytes, 98) && + !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info) +{ + u64 used; + + spin_lock(&space_info->lock); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (need_do_async_reclaim(space_info, fs_info, used)) { + spin_unlock(&space_info->lock); + return 1; + } + spin_unlock(&space_info->lock); + + return 0; +} + +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) + return; + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + flush_state++; + if (!btrfs_need_do_async_reclaim(space_info, fs_info)) + return; + } while (flush_state <= COMMIT_TRANS); + + if (btrfs_need_do_async_reclaim(space_info, fs_info)) + queue_work(system_unbound_wq, work); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -4293,8 +4505,13 @@ again: if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + if (need_do_async_reclaim(space_info, root->fs_info, used) && + !work_busy(&root->fs_info->async_reclaim_work)) + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); } - spin_unlock(&space_info->lock); if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) @@ -4332,6 +4549,10 @@ out: !block_rsv_use_bytes(global_rsv, orig_bytes)) ret = 0; } + if (ret == -ENOSPC) + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + space_info->flags, orig_bytes, 1); if (flushing) { spin_lock(&space_info->lock); space_info->flush = 0; @@ -4347,7 +4568,7 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (root->ref_cows) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) block_rsv = trans->block_rsv; if (root == root->fs_info->csum_root && trans->adding_csums) @@ -4584,7 +4805,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root, u64 num_bytes) { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - if (global_rsv->full || global_rsv == block_rsv || + if (global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, @@ -4986,7 +5207,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); if (to_reserve) - trace_btrfs_space_reservation(root->fs_info,"delalloc", + trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_reserve, 1); block_rsv_add_bytes(block_rsv, to_reserve, 1); @@ -5264,6 +5485,8 @@ static int pin_down_extent(struct btrfs_root *root, set_extent_dirty(root->fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + if (reserved) + trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); return 0; } @@ -5392,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @reserve: One of the reservation enums + * @delalloc: The blocks are allocated for the delalloc write * * This is called by the allocator when it reserves space, or by somebody who is * freeing space that was never actually used on disk. For example if you @@ -5410,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, * succeeds. */ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve) + u64 num_bytes, int reserve, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; @@ -5429,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, num_bytes, 0); space_info->bytes_may_use -= num_bytes; } + + if (delalloc) + cache->delalloc_bytes += num_bytes; } } else { if (cache->ro) space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -5448,9 +5678,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; - struct btrfs_space_info *space_info; - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { @@ -5469,10 +5698,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, else fs_info->pinned_extents = &fs_info->freed_extents[0]; - up_write(&fs_info->extent_commit_sem); - - list_for_each_entry_rcu(space_info, &fs_info->space_info, list) - percpu_counter_set(&space_info->total_bytes_pinned, 0); + up_write(&fs_info->commit_root_sem); update_global_block_rsv(fs_info); } @@ -5511,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; @@ -5597,7 +5824,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int no_quota) { struct btrfs_key key; struct btrfs_path *path; @@ -5613,9 +5841,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int num_to_del = 1; u32 item_size; u64 refs; + int last_ref = 0; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); + if (!info->quota_enabled || !is_fstree(root_objectid)) + no_quota = 1; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -5663,7 +5896,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(iref); ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, - is_data); + is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5698,6 +5931,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (ret > 0 && skinny_metadata) { skinny_metadata = false; + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -5718,13 +5952,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } extent_slot = path->slots[0]; } - } else if (ret == -ENOENT) { + } else if (WARN_ON(ret == -ENOENT)) { btrfs_print_leaf(extent_root, path->nodes[0]); - WARN_ON(1); btrfs_err(info, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", bytenr, parent, root_objectid, owner_objectid, owner_offset); + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } else { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5780,7 +6015,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { btrfs_err(info, "trying to drop %d refs but we only have %Lu " - "for bytenr %Lu\n", refs_to_drop, refs, bytenr); + "for bytenr %Lu", refs_to_drop, refs, bytenr); ret = -EINVAL; btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5788,6 +6023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs -= refs_to_drop; if (refs > 0) { + type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -5803,7 +6039,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, - is_data); + is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5824,6 +6060,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } + last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { @@ -5846,6 +6083,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } + btrfs_release_path(path); + + /* Deal with the quota accounting */ + if (!ret && last_ref && !no_quota) { + int mod_seq = 0; + + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && + type == BTRFS_QGROUP_OPER_SUB_SHARED) + mod_seq = 1; + + ret = btrfs_qgroup_record_ref(trans, info, root_objectid, + bytenr, num_bytes, type, + mod_seq); + } out: btrfs_free_path(path); return ret; @@ -5862,24 +6113,16 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); if (!head) - goto out; + goto out_delayed_unlock; - node = rb_prev(&head->node.rb_node); - if (!node) - goto out; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - /* there are still entries for this ref, we can't drop it */ - if (ref->bytenr == bytenr) + spin_lock(&head->lock); + if (rb_first(&head->ref_root)) goto out; if (head->extent_op) { @@ -5901,19 +6144,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * ahead and process it. */ head->node.in_tree = 0; - rb_erase(&head->node.rb_node, &delayed_refs->root); + rb_erase(&head->href_node, &delayed_refs->href_root); - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); /* * we don't take a ref on the node because we're removing it from the * tree, so we just steal the ref the tree was holding. */ delayed_refs->num_heads--; - if (list_empty(&head->cluster)) + if (head->processing == 0) delayed_refs->num_heads_ready--; - - list_del_init(&head->cluster); + head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); BUG_ON(head->extent_op); @@ -5924,6 +6167,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return ret; out: + spin_unlock(&head->lock); + +out_delayed_unlock: spin_unlock(&delayed_refs->lock); return 0; } @@ -5966,7 +6212,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; } out: @@ -5986,11 +6233,15 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow) + u64 owner, u64 offset, int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); /* @@ -6006,13 +6257,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + BTRFS_DROP_DELAYED_REF, NULL, no_quota); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, BTRFS_DROP_DELAYED_REF, - NULL, for_cow); + NULL, no_quota); } return ret; } @@ -6090,11 +6341,29 @@ int __get_raid_index(u64 flags) return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +int get_block_group_index(struct btrfs_block_group_cache *cache) { return __get_raid_index(cache->flags); } +static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = "raid10", + [BTRFS_RAID_RAID1] = "raid1", + [BTRFS_RAID_DUP] = "dup", + [BTRFS_RAID_RAID0] = "raid0", + [BTRFS_RAID_SINGLE] = "single", + [BTRFS_RAID_RAID5] = "raid5", + [BTRFS_RAID_RAID6] = "raid6", +}; + +static const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; + + return btrfs_raid_type_names[type]; +} + enum btrfs_loop_type { LOOP_CACHING_NOWAIT = 0, LOOP_CACHING_WAIT = 1, @@ -6102,6 +6371,70 @@ enum btrfs_loop_type { LOOP_NO_EMPTY_SIZE = 3, }; +static inline void +btrfs_lock_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + down_read(&cache->data_rwsem); +} + +static inline void +btrfs_grab_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + btrfs_get_block_group(cache); + if (delalloc) + down_read(&cache->data_rwsem); +} + +static struct btrfs_block_group_cache * +btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + int delalloc) +{ + struct btrfs_block_group_cache *used_bg; + bool locked = false; +again: + spin_lock(&cluster->refill_lock); + if (locked) { + if (used_bg == cluster->block_group) + return used_bg; + + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } + + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) + return used_bg; + + btrfs_get_block_group(used_bg); + + if (!delalloc) + return used_bg; + + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; + + spin_unlock(&cluster->refill_lock); + down_read(&used_bg->data_rwsem); + locked = true; + goto again; +} + +static inline void +btrfs_release_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + up_read(&cache->data_rwsem); + btrfs_put_block_group(cache); +} + /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: @@ -6116,13 +6449,12 @@ enum btrfs_loop_type { static noinline int find_free_extent(struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, - u64 flags) + u64 flags, int delalloc) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - struct btrfs_block_group_cache *used_block_group; u64 search_start = 0; u64 max_extent_size = 0; int empty_cluster = 2 * 1024 * 1024; @@ -6131,7 +6463,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, int index = __get_raid_index(flags); int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; - bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -6184,7 +6515,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); - used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -6207,6 +6537,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, up_read(&space_info->groups_sem); } else { index = get_block_group_index(block_group); + btrfs_lock_block_group(block_group, delalloc); goto have_block_group; } } else if (block_group) { @@ -6221,8 +6552,7 @@ search: u64 offset; int cached; - used_block_group = block_group; - btrfs_get_block_group(block_group); + btrfs_grab_block_group(block_group, delalloc); search_start = block_group->key.objectid; /* @@ -6249,7 +6579,6 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { - found_uncached_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -6265,23 +6594,22 @@ have_block_group: * lets look there */ if (last_ptr) { + struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* * the refill lock keeps out other * people trying to start a new cluster */ - spin_lock(&last_ptr->refill_lock); - used_block_group = last_ptr->block_group; - if (used_block_group != block_group && - (!used_block_group || - used_block_group->ro || - !block_group_bits(used_block_group, flags))) { - used_block_group = block_group; + used_block_group = btrfs_lock_cluster(block_group, + last_ptr, + delalloc); + if (!used_block_group) goto refill_cluster; - } - if (used_block_group != block_group) - btrfs_get_block_group(used_block_group); + if (used_block_group != block_group && + (used_block_group->ro || + !block_group_bits(used_block_group, flags))) + goto release_cluster; offset = btrfs_alloc_from_cluster(used_block_group, last_ptr, @@ -6292,17 +6620,18 @@ have_block_group: /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, num_bytes); + used_block_group, + search_start, num_bytes); + if (used_block_group != block_group) { + btrfs_release_block_group(block_group, + delalloc); + block_group = used_block_group; + } goto checks; } WARN_ON(last_ptr->block_group != used_block_group); - if (used_block_group != block_group) { - btrfs_put_block_group(used_block_group); - used_block_group = block_group; - } -refill_cluster: - BUG_ON(used_block_group != block_group); +release_cluster: /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -6319,8 +6648,10 @@ refill_cluster: * succeeding in the unclustered * allocation. */ if (loop >= LOOP_NO_EMPTY_SIZE && - last_ptr->block_group != block_group) { + used_block_group != block_group) { spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(used_block_group, + delalloc); goto unclustered_alloc; } @@ -6330,6 +6661,10 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); + if (used_block_group != block_group) + btrfs_release_block_group(used_block_group, + delalloc); +refill_cluster: if (loop >= LOOP_NO_EMPTY_SIZE) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; @@ -6421,25 +6756,25 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, used_block_group, + search_start = stripe_align(root, block_group, offset, num_bytes); /* move on to the next group */ if (search_start + num_bytes > - used_block_group->key.objectid + used_block_group->key.offset) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, + btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, - alloc_type); + ret = btrfs_update_reserved_bytes(block_group, num_bytes, + alloc_type, delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } @@ -6449,17 +6784,13 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, delalloc); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, delalloc); } up_read(&space_info->groups_sem); @@ -6482,8 +6813,14 @@ loop: loop++; if (loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); - trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; @@ -6500,7 +6837,8 @@ loop: root, ret); else ret = 0; - btrfs_end_transaction(trans, root); + if (!exist) + btrfs_end_transaction(trans, root); if (ret) goto out; } @@ -6529,12 +6867,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", info->flags, info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly, (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, @@ -6548,7 +6886,9 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", + printk(KERN_INFO "BTRFS: " + "block group %llu has %llu bytes, " + "%llu used %llu pinned %llu reserved %s\n", cache->key.objectid, cache->key.offset, btrfs_block_group_used(&cache->item), cache->pinned, cache->reserved, cache->ro ? "[readonly]" : ""); @@ -6563,7 +6903,7 @@ again: int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data) + struct btrfs_key *ins, int is_data, int delalloc) { bool final_tried = false; u64 flags; @@ -6573,7 +6913,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, - flags); + flags, delalloc); if (ret == -ENOSPC) { if (!final_tried && ins->offset) { @@ -6594,13 +6934,12 @@ again: } } - trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); - return ret; } static int __btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len, int pin) + u64 start, u64 len, + int pin, int delalloc) { struct btrfs_block_group_cache *cache; int ret = 0; @@ -6619,7 +6958,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, pin_down_extent(root, cache, start, len, 1); else { btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); } btrfs_put_block_group(cache); @@ -6629,15 +6968,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, } int btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len) + u64 start, u64 len, int delalloc) { - return __btrfs_free_reserved_extent(root, start, len, 0); + return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); } int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, u64 start, u64 len) { - return __btrfs_free_reserved_extent(root, start, len, 1); + return __btrfs_free_reserved_extent(root, start, len, 1, 0); } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -6701,12 +7040,20 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); + /* Always set parent to 0 here since its exclusive anyway. */ + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, ins->offset, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); BUG(); } + trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); return ret; } @@ -6714,7 +7061,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins) + int level, struct btrfs_key *ins, + int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -6724,6 +7072,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; u32 size = sizeof(*extent_item) + sizeof(*iref); + u64 num_bytes = ins->offset; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -6731,13 +7080,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, size += sizeof(*block_info); path = btrfs_alloc_path(); - if (!path) + if (!path) { + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->leafsize); return -ENOMEM; + } path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); if (ret) { + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->leafsize); btrfs_free_path(path); return ret; } @@ -6752,6 +7106,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + num_bytes = root->leafsize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, key); @@ -6773,12 +7128,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, num_bytes, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + } + ret = update_block_group(root, ins->objectid, root->leafsize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); BUG(); } + + trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); return ret; } @@ -6826,7 +7191,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, return -EINVAL; ret = btrfs_update_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT); + RESERVE_ALLOC_NO_ACCOUNT, 0); BUG_ON(ret); /* logic error */ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); @@ -6905,7 +7270,7 @@ again: /*DEFAULT_RATELIMIT_BURST*/ 1); if (__ratelimit(&_rs)) WARN(1, KERN_DEBUG - "btrfs: block rsv returned %d\n", ret); + "BTRFS: block rsv returned %d\n", ret); } try_reserve: ret = reserve_metadata_bytes(root, block_rsv, blocksize, @@ -6954,12 +7319,21 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { + buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, + blocksize, level); + if (!IS_ERR(buf)) + root->alloc_bytenr += blocksize; + return buf; + } +#endif block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(root, blocksize, blocksize, - empty_size, hint, &ins, 0); + empty_size, hint, &ins, 0, 0); if (ret) { unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -7653,7 +8027,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { - pr_debug("btrfs: drop snapshot early exit\n"); + pr_debug("BTRFS: drop snapshot early exit\n"); err = -EAGAIN; goto out_free; } @@ -7695,7 +8069,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } } - if (root->in_radix) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { btrfs_drop_and_free_fs_root(tree_root->fs_info, root); } else { free_extent_buffer(root->node); @@ -7718,7 +8092,7 @@ out: */ if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); - if (err) + if (err && err != -EAGAIN) btrfs_std_error(root->fs_info, err); return err; } @@ -7983,7 +8357,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) spin_lock(&sinfo->lock); - for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) if (!list_empty(&sinfo->block_groups[i])) free_bytes += __btrfs_get_ro_block_group_free_space( &sinfo->block_groups[i]); @@ -8222,14 +8596,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; - down_write(&info->extent_commit_sem); + down_write(&info->commit_root_sem); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); put_caching_control(caching_ctl); } - up_write(&info->extent_commit_sem); + up_write(&info->commit_root_sem); spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { @@ -8271,21 +8645,31 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) release_global_block_rsv(info); - while(!list_empty(&info->space_info)) { + while (!list_empty(&info->space_info)) { + int i; + space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { - if (space_info->bytes_pinned > 0 || + if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0) { - WARN_ON(1); + space_info->bytes_may_use > 0)) { dump_space_info(space_info, 0, 0); } } - percpu_counter_destroy(&space_info->total_bytes_pinned); list_del(&space_info->list); - kfree(space_info); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); } return 0; } @@ -8294,10 +8678,71 @@ static void __link_block_group(struct btrfs_space_info *space_info, struct btrfs_block_group_cache *cache) { int index = get_block_group_index(cache); + bool first = false; down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) + first = true; list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); + + if (first) { + struct raid_kobject *rkobj; + int ret; + + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) + goto out_err; + rkobj->raid_type = index; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, + "%s", get_raid_name(index)); + if (ret) { + kobject_put(&rkobj->kobj); + goto out_err; + } + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + + return; +out_err: + pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); +} + +static struct btrfs_block_group_cache * +btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = start; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + + cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + start); + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + init_rwsem(&cache->data_rwsem); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + btrfs_init_free_space_ctl(cache); + + return cache; } int btrfs_read_block_groups(struct btrfs_root *root) @@ -8335,26 +8780,16 @@ int btrfs_read_block_groups(struct btrfs_root *root) break; if (ret != 0) goto error; + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + + cache = btrfs_create_block_group_cache(root, found_key.objectid, + found_key.offset); if (!cache) { ret = -ENOMEM; goto error; } - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - ret = -ENOMEM; - goto error; - } - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - cache->fs_info = info; - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); if (need_clear) { /* @@ -8375,16 +8810,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); - memcpy(&cache->key, &found_key, sizeof(found_key)); + cache->flags = btrfs_block_group_flags(&cache->item); key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(path); - cache->flags = btrfs_block_group_flags(&cache->item); - cache->sectorsize = root->sectorsize; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - found_key.objectid); - btrfs_init_free_space_ctl(cache); /* * We need to exclude the super stripes now so that the space @@ -8398,8 +8827,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * case. */ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); goto error; } @@ -8528,40 +8956,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + cache = btrfs_create_block_group_cache(root, chunk_offset, size); if (!cache) return -ENOMEM; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return -ENOMEM; - } - - cache->key.objectid = chunk_offset; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = root->sectorsize; - cache->fs_info = root->fs_info; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - chunk_offset); - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); - - btrfs_init_free_space_ctl(cache); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); - cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; ret = exclude_super_stripes(root, cache); @@ -8571,8 +8976,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * case. */ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); return ret; } @@ -8638,6 +9042,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_key key; struct inode *inode; + struct kobject *kobj = NULL; int ret; int index; int factor; @@ -8736,9 +9141,16 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); - if (list_empty(&block_group->space_info->block_groups[index])) + if (list_empty(&block_group->space_info->block_groups[index])) { + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; clear_avail_alloc_bits(root->fs_info, block_group->flags); + } up_write(&block_group->space_info->groups_sem); + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); @@ -8880,3 +9292,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) range->len = trimmed; return ret; } + +/* + * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), + * they are used to prevent the some tasks writing data into the page cache + * by nocow before the subvolume is snapshoted, but flush the data into + * the disk after the snapshot creation. + */ +void btrfs_end_nocow_write(struct btrfs_root *root) +{ + percpu_counter_dec(&root->subv_writers->counter); + /* + * Make sure counter is updated before we wake up + * waiters. + */ + smp_mb(); + if (waitqueue_active(&root->subv_writers->wait)) + wake_up(&root->subv_writers->wait); +} + +int btrfs_start_nocow_write(struct btrfs_root *root) +{ + if (unlikely(atomic_read(&root->will_be_snapshoted))) + return 0; + + percpu_counter_inc(&root->subv_writers->counter); + /* + * Make sure counter is updated before we check for snapshot creation. + */ + smp_mb(); + if (unlikely(atomic_read(&root->will_be_snapshoted))) { + btrfs_end_nocow_write(root); + return 0; + } + return 1; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 22bda32acb8..a389820d158 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -13,13 +13,13 @@ #include <linux/cleancache.h> #include "extent_io.h" #include "extent_map.h" -#include "compat.h" #include "ctree.h" #include "btrfs_inode.h" #include "volumes.h" #include "check-integrity.h" #include "locking.h" #include "rcu-string.h" +#include "backref.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -59,7 +59,7 @@ void btrfs_leak_debug_check(void) while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR "btrfs state leak: start %llu end %llu " + printk(KERN_ERR "BTRFS: state leak: start %llu end %llu " "state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs)); @@ -69,7 +69,7 @@ void btrfs_leak_debug_check(void) while (!list_empty(&buffers)) { eb = list_entry(buffers.next, struct extent_buffer, leak_list); - printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu " "refs %d\n", eb->start, eb->len, atomic_read(&eb->refs)); list_del(&eb->leak_list); @@ -77,16 +77,22 @@ void btrfs_leak_debug_check(void) } } -#define btrfs_debug_check_extent_io_range(inode, start, end) \ - __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end)) +#define btrfs_debug_check_extent_io_range(tree, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) static inline void __btrfs_debug_check_extent_io_range(const char *caller, - struct inode *inode, u64 start, u64 end) + struct extent_io_tree *tree, u64 start, u64 end) { - u64 isize = i_size_read(inode); + struct inode *inode; + u64 isize; + + if (!tree->mapping) + return; + inode = tree->mapping->host; + isize = i_size_read(inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { printk_ratelimited(KERN_DEBUG - "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", caller, btrfs_ino(inode), isize, start, end); } } @@ -124,6 +130,8 @@ static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) { + if (!tree->mapping) + return NULL; return btrfs_sb(tree->mapping->host->i_sb); } @@ -186,11 +194,9 @@ void extent_io_tree_init(struct extent_io_tree *tree, struct address_space *mapping) { tree->state = RB_ROOT; - INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); - spin_lock_init(&tree->buffer_lock); tree->mapping = mapping; } @@ -223,13 +229,24 @@ void free_extent_state(struct extent_state *state) } } -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) +static struct rb_node *tree_insert(struct rb_root *root, + struct rb_node *search_start, + u64 offset, + struct rb_node *node, + struct rb_node ***p_in, + struct rb_node **parent_in) { - struct rb_node **p = &root->rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct tree_entry *entry; + if (p_in && parent_in) { + p = *p_in; + parent = *parent_in; + goto do_insert; + } + + p = search_start ? &search_start : &root->rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct tree_entry, rb_node); @@ -242,35 +259,43 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return parent; } +do_insert: rb_link_node(node, parent, p); rb_insert_color(node, root); return NULL; } static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) + struct rb_node **prev_ret, + struct rb_node **next_ret, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_root *root = &tree->state; - struct rb_node *n = root->rb_node; + struct rb_node **n = &root->rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev = NULL; struct tree_entry *entry; struct tree_entry *prev_entry = NULL; - while (n) { - entry = rb_entry(n, struct tree_entry, rb_node); - prev = n; + while (*n) { + prev = *n; + entry = rb_entry(prev, struct tree_entry, rb_node); prev_entry = entry; if (offset < entry->start) - n = n->rb_left; + n = &(*n)->rb_left; else if (offset > entry->end) - n = n->rb_right; + n = &(*n)->rb_right; else - return n; + return *n; } + if (p_ret) + *p_ret = n; + if (parent_ret) + *parent_ret = prev; + if (prev_ret) { orig_prev = prev; while (prev && offset > prev_entry->end) { @@ -292,18 +317,27 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, return NULL; } -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) +static inline struct rb_node * +tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_node *prev = NULL; struct rb_node *ret; - ret = __etree_search(tree, offset, &prev, NULL); + ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); if (!ret) return prev; return ret; } +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, struct extent_state *other) { @@ -385,23 +419,25 @@ static void set_state_bits(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, + struct rb_node ***p, + struct rb_node **parent, unsigned long *bits) { struct rb_node *node; if (end < start) - WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", + WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", end, start); state->start = start; state->end = end; set_state_bits(tree, state, bits); - node = tree_insert(&tree->state, end, &state->rb_node); + node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - printk(KERN_ERR "btrfs found node %llu %llu on insert of " + printk(KERN_ERR "BTRFS: found node %llu %llu on insert of " "%llu %llu\n", found->start, found->end, start, end); return -EEXIST; @@ -444,7 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, + &prealloc->rb_node, NULL, NULL); if (node) { free_extent_state(prealloc); return -EEXIST; @@ -542,7 +579,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err; int clear = 0; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; @@ -702,7 +739,7 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct rb_node *node; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); spin_lock(&tree->lock); again: @@ -712,6 +749,7 @@ again: * our range starts */ node = tree_search(tree, start); +process_node: if (!node) break; @@ -732,7 +770,10 @@ again: if (start > end) break; - cond_resched_lock(&tree->lock); + if (!cond_resched_lock(&tree->lock)) { + node = rb_next(node); + goto process_node; + } } out: spin_unlock(&tree->lock); @@ -783,11 +824,13 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; int err = 0; u64 last_start; u64 last_end; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); bits |= EXTENT_FIRST_DELALLOC; again: @@ -809,14 +852,16 @@ again: * this search will find all the extents that end after * our range starts. */ - node = tree_search(tree, start); + node = tree_search_for_insert(tree, start, &p, &parent); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, &bits); + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); if (err) extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; goto out; } @@ -919,7 +964,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - &bits); + NULL, NULL, &bits); if (err) extent_io_tree_panic(tree, err); @@ -1005,11 +1050,13 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; int err = 0; u64 last_start; u64 last_end; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); again: if (!prealloc && (mask & __GFP_WAIT)) { @@ -1032,17 +1079,19 @@ again: * this search will find all the extents that end after * our range starts. */ - node = tree_search(tree, start); + node = tree_search_for_insert(tree, start, &p, &parent); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { err = -ENOMEM; goto out; } - err = insert_state(tree, prealloc, start, end, &bits); - prealloc = NULL; + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); if (err) extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); + prealloc = NULL; goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -1135,7 +1184,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - &bits); + NULL, NULL, &bits); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1490,10 +1539,8 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, cur_start = state->end + 1; node = rb_next(node); total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) { - *end = *start + max_bytes - 1; + if (total_bytes >= max_bytes) break; - } if (!node) break; } @@ -1599,11 +1646,10 @@ done: * * 1 is returned if we find something, 0 if nothing was in the tree */ -static noinline u64 find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, - u64 *start, u64 *end, - u64 max_bytes) +STATIC u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes) { u64 delalloc_start; u64 delalloc_end; @@ -1635,10 +1681,9 @@ again: /* * make sure to limit the number of pages we try to lock down - * if we're looping. */ - if (delalloc_end + 1 - delalloc_start > max_bytes && loops) - delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + if (delalloc_end + 1 - delalloc_start > max_bytes) + delalloc_end = delalloc_start + max_bytes - 1; /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, @@ -1648,9 +1693,9 @@ again: * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); + cached_state = NULL; if (!loops) { - unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); - max_bytes = PAGE_CACHE_SIZE - offset; + max_bytes = PAGE_CACHE_SIZE; loops = 1; goto again; } else { @@ -1744,10 +1789,8 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 last = 0; int found = 0; - if (search_end <= cur_start) { - WARN_ON(1); + if (WARN_ON(search_end <= cur_start)) return 0; - } spin_lock(&tree->lock); if (cur_start == 0 && bits == EXTENT_DIRTY) { @@ -1959,11 +2002,6 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, return err; } -static void repair_io_failure_callback(struct bio *bio, int err) -{ - complete(bio->bi_private); -} - /* * this bypasses the standard btrfs submit functions deliberately, as * the standard behavior is to write all copies in a raid setup. here we only @@ -1980,13 +2018,13 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, { struct bio *bio; struct btrfs_device *dev; - DECLARE_COMPLETION_ONSTACK(compl); u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int ret; + ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); BUG_ON(!mirror_num); /* we can't repair anything in raid56 yet */ @@ -1996,9 +2034,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_private = &compl; - bio->bi_end_io = repair_io_failure_callback; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; map_length = length; ret = btrfs_map_block(fs_info, WRITE, logical, @@ -2009,7 +2045,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, } BUG_ON(mirror_num != bbio->mirror_num); sector = bbio->stripes[mirror_num-1].physical >> 9; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; dev = bbio->stripes[mirror_num-1].dev; kfree(bbio); if (!dev || !dev->bdev || !dev->writeable) { @@ -2018,19 +2054,18 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, } bio->bi_bdev = dev->bdev; bio_add_page(bio, page, length, start - page_offset(page)); - btrfsic_submit_bio(WRITE_SYNC, bio); - wait_for_completion(&compl); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { /* try to remap that extent elsewhere? */ bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } - printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " - "(dev %s sector %llu)\n", page->mapping->host->i_ino, - start, rcu_str_deref(dev->name), sector); + printk_ratelimited_in_rcu(KERN_INFO + "BTRFS: read error corrected: ino %lu off %llu " + "(dev %s sector %llu)\n", page->mapping->host->i_ino, + start, rcu_str_deref(dev->name), sector); bio_put(bio); return 0; @@ -2043,6 +2078,9 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); int ret = 0; + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, @@ -2064,12 +2102,12 @@ static int clean_io_failure(u64 start, struct page *page) u64 private; u64 private_failure; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info; + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct extent_state *state; int num_copies; int did_repair = 0; int ret; - struct inode *inode = page->mapping->host; private = 0; ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, @@ -2092,6 +2130,8 @@ static int clean_io_failure(u64 start, struct page *page) did_repair = 1; goto out; } + if (fs_info->sb->s_flags & MS_RDONLY) + goto out; spin_lock(&BTRFS_I(inode)->io_tree.lock); state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, @@ -2101,7 +2141,6 @@ static int clean_io_failure(u64 start, struct page *page) if (state && state->start <= failrec->start && state->end >= failrec->start + failrec->len - 1) { - fs_info = BTRFS_I(inode)->root->fs_info; num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); if (num_copies > 1) { @@ -2168,7 +2207,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, return -EIO; } - if (em->start > start || em->start + em->len < start) { + if (em->start > start || em->start + em->len <= start) { free_extent_map(em); em = NULL; } @@ -2280,9 +2319,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, return -EIO; } bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; + bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; btrfs_failed_bio = btrfs_io_bio(failed_bio); if (btrfs_failed_bio->csum) { @@ -2315,7 +2354,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); struct extent_io_tree *tree; - int ret; + int ret = 0; tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -2329,6 +2368,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) if (!uptodate) { ClearPageUptodate(page); SetPageError(page); + ret = ret < 0 ? ret : -EIO; + mapping_set_error(page->mapping, ret); } return 0; } @@ -2344,37 +2385,39 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio, int err) { - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_io_tree *tree; + struct bio_vec *bvec; u64 start; u64 end; + int i; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - tree = &BTRFS_I(page->mapping->host)->io_tree; /* We always issue full-page reads, but if some block * in a page fails to read, blk_update_request() will * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) - printk("%s page write in btrfs with offset %u and length %u\n", - bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE - ? KERN_ERR "partial" : KERN_INFO "incomplete", - bvec->bv_offset, bvec->bv_len); + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page write in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); + } start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (end_extent_writepage(page, err, start, end)) continue; end_page_writeback(page); - } while (bvec >= bio->bi_io_vec); + } bio_put(bio); } @@ -2404,9 +2447,8 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, */ static void end_bio_extent_readpage(struct bio *bio, int err) { + struct bio_vec *bvec; int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; u64 offset = 0; @@ -2417,16 +2459,17 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 extent_len = 0; int mirror; int ret; + int i; if (err) uptodate = 0; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%lu\n", (u64)bio->bi_sector, err, + "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; @@ -2435,19 +2478,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err) * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) - printk("%s page read in btrfs with offset %u and length %u\n", - bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE - ? KERN_ERR "partial" : KERN_INFO "incomplete", - bvec->bv_offset, bvec->bv_len); + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page read in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page read in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); + } start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; len = bvec->bv_len; - if (++bvec <= bvec_end) - prefetchw(&bvec->bv_page->flags); - mirror = io_bio->mirror_num; if (likely(uptodate && tree->ops && tree->ops->readpage_end_io_hook)) { @@ -2528,7 +2574,7 @@ readpage_ok: extent_start = start; extent_len = end + 1 - start; } - } while (bvec <= bvec_end); + } if (extent_len) endio_readpage_release_extent(tree, extent_start, extent_len, @@ -2559,9 +2605,8 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, } if (bio) { - bio->bi_size = 0; bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; btrfs_bio = btrfs_io_bio(bio); btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; @@ -2655,7 +2700,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret && *bio_ret) { bio = *bio_ret; if (old_compressed) - contig = bio->bi_sector == sector; + contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; @@ -2722,7 +2767,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, if (em_cached && *em_cached) { em = *em_cached; - if (em->in_tree && start >= em->start && + if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { atomic_inc(&em->refs); return em; @@ -3056,143 +3101,130 @@ static noinline void update_nr_written(struct page *page, } /* - * the writepage semantics are similar to regular writepage. extent - * records are inserted to lock ranges in the tree, and as dirty areas - * are found, they are marked writeback. Then the lock bits are removed - * and the end_io handler clears the writeback ranges + * helper for __extent_writepage, doing all of the delayed allocation setup. + * + * This returns 1 if our fill_delalloc function did all the work required + * to write the page (copy into inline extent). In this case the IO has + * been started and the page is already unlocked. + * + * This returns 0 if all went well (page still locked) + * This returns < 0 if there were errors (page still locked) */ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, - void *data) +static noinline_for_stack int writepage_delalloc(struct inode *inode, + struct page *page, struct writeback_control *wbc, + struct extent_page_data *epd, + u64 delalloc_start, + unsigned long *nr_written) +{ + struct extent_io_tree *tree = epd->tree; + u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1; + u64 nr_delalloc; + u64 delalloc_to_write = 0; + u64 delalloc_end = 0; + int ret; + int page_started = 0; + + if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) + return 0; + + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + nr_written); + /* File system has been set read-only */ + if (ret) { + SetPageError(page); + /* fill_delalloc should be return < 0 for error + * but just in case, we use > 0 here meaning the + * IO is started, so we don't want to return > 0 + * unless things are going well. + */ + ret = ret < 0 ? ret : -EIO; + goto done; + } + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= *nr_written; + return 1; + } + + ret = 0; + +done: + return ret; +} + +/* + * helper for __extent_writepage. This calls the writepage start hooks, + * and does the loop to map the page into extents and bios. + * + * We return 1 if the IO is started and the page is unlocked, + * 0 if all went well (page still locked) + * < 0 if there were errors (page still locked) + */ +static noinline_for_stack int __extent_writepage_io(struct inode *inode, + struct page *page, + struct writeback_control *wbc, + struct extent_page_data *epd, + loff_t i_size, + unsigned long nr_written, + int write_flags, int *nr_ret) { - struct inode *inode = page->mapping->host; - struct extent_page_data *epd = data; struct extent_io_tree *tree = epd->tree; u64 start = page_offset(page); - u64 delalloc_start; u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; u64 extent_offset; - u64 last_byte = i_size_read(inode); u64 block_start; u64 iosize; sector_t sector; struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; - int ret; - int nr = 0; size_t pg_offset = 0; size_t blocksize; - loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; - u64 nr_delalloc; - u64 delalloc_end; - int page_started; - int compressed; - int write_flags; - unsigned long nr_written = 0; - bool fill_delalloc = true; - - if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC; - else - write_flags = WRITE; - - trace___extent_writepage(page, inode, wbc); - - WARN_ON(!PageLocked(page)); - - ClearPageError(page); - - pg_offset = i_size & (PAGE_CACHE_SIZE - 1); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); - unlock_page(page); - return 0; - } - - if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, - PAGE_CACHE_SIZE - pg_offset); - kunmap_atomic(userpage); - flush_dcache_page(page); - } - pg_offset = 0; - - set_page_extent_mapped(page); - - if (!tree->ops || !tree->ops->fill_delalloc) - fill_delalloc = false; - - delalloc_start = start; - delalloc_end = 0; - page_started = 0; - if (!epd->extent_locked && fill_delalloc) { - u64 delalloc_to_write = 0; - /* - * make sure the wbc mapping index is at least updated - * to this page. - */ - update_nr_written(page, wbc, 0); - - while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, - page, - &delalloc_start, - &delalloc_end, - 128 * 1024 * 1024); - if (nr_delalloc == 0) { - delalloc_start = delalloc_end + 1; - continue; - } - ret = tree->ops->fill_delalloc(inode, page, - delalloc_start, - delalloc_end, - &page_started, - &nr_written); - /* File system has been set read-only */ - if (ret) { - SetPageError(page); - goto done; - } - /* - * delalloc_end is already one less than the total - * length, so we don't subtract one from - * PAGE_CACHE_SIZE - */ - delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; - delalloc_start = delalloc_end + 1; - } - if (wbc->nr_to_write < delalloc_to_write) { - int thresh = 8192; - - if (delalloc_to_write < thresh * 2) - thresh = delalloc_to_write; - wbc->nr_to_write = min_t(u64, delalloc_to_write, - thresh); - } + int ret = 0; + int nr = 0; + bool compressed; - /* did the fill delalloc function already unlock and start - * the IO? - */ - if (page_started) { - ret = 0; - /* - * we've unlocked the page, so we can't update - * the mapping's writeback index, just update - * nr_to_write. - */ - wbc->nr_to_write -= nr_written; - goto done_unlocked; - } - } if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); @@ -3202,9 +3234,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, wbc->pages_skipped++; else redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); - ret = 0; + ret = 1; goto done_unlocked; } } @@ -3216,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, update_nr_written(page, wbc, nr_written + 1); end = page_end; - if (last_byte <= start) { + if (i_size <= start) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); @@ -3226,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, blocksize = inode->i_sb->s_blocksize; while (cur <= end) { - if (cur >= last_byte) { + u64 em_end; + if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); @@ -3236,13 +3270,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, end - cur + 1, 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); + ret = PTR_ERR_OR_ZERO(em); break; } extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); + em_end = extent_map_end(em); + BUG_ON(em_end <= cur); BUG_ON(end < cur); - iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; @@ -3278,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset += iosize; continue; } - /* leave this out until we have a page_mkwrite call */ - if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0, NULL)) { - cur = cur + iosize; - pg_offset += iosize; - continue; - } if (tree->ops && tree->ops->writepage_io_hook) { ret = tree->ops->writepage_io_hook(page, cur, @@ -3295,12 +3324,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret) { SetPageError(page); } else { - unsigned long max_nr = end_index + 1; + unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { - printk(KERN_ERR "btrfs warning page %lu not " - "writeback, cur %llu end %llu\n", + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", page->index, cur, end); } @@ -3317,17 +3346,94 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, nr++; } done: + *nr_ret = nr; + +done_unlocked: + + /* drop our reference on any cached states */ + free_extent_state(cached_state); + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + u64 start = page_offset(page); + u64 page_end = start + PAGE_CACHE_SIZE - 1; + int ret; + int nr = 0; + size_t pg_offset = 0; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + int write_flags; + unsigned long nr_written = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC; + else + write_flags = WRITE; + + trace___extent_writepage(page, inode, wbc); + + WARN_ON(!PageLocked(page)); + + ClearPageError(page); + + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage); + flush_dcache_page(page); + } + + pg_offset = 0; + + set_page_extent_mapped(page); + + ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); + if (ret == 1) + goto done_unlocked; + if (ret) + goto done; + + ret = __extent_writepage_io(inode, page, wbc, epd, + i_size, nr_written, write_flags, &nr); + if (ret == 1) + goto done_unlocked; + +done: if (nr == 0) { /* make sure the mapping tag for page dirty gets cleared */ set_page_writeback(page); end_page_writeback(page); } + if (PageError(page)) { + ret = ret < 0 ? ret : -EIO; + end_extent_writepage(page, ret, start, page_end); + } unlock_page(page); + return ret; done_unlocked: - - /* drop our reference on any cached states */ - free_extent_state(cached_state); return 0; } @@ -3343,9 +3449,10 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } -static int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, - struct extent_page_data *epd) +static noinline_for_stack int +lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) { unsigned long i, num_pages; int flush = 0; @@ -3416,26 +3523,24 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, static void end_extent_buffer_writeback(struct extent_buffer *eb) { clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); } static void end_bio_extent_buffer_writepage(struct bio *bio, int err) { - int uptodate = err == 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; struct extent_buffer *eb; - int done; + int i, done; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - bvec--; eb = (struct extent_buffer *)page->private; BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); ClearPageUptodate(page); SetPageError(page); @@ -3447,18 +3552,18 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) continue; end_extent_buffer_writeback(eb); - } while (bvec >= bio->bi_io_vec); + } bio_put(bio); - } -static int write_one_eb(struct extent_buffer *eb, +static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct btrfs_fs_info *fs_info, struct writeback_control *wbc, struct extent_page_data *epd) { struct block_device *bdev = fs_info->fs_devices->latest_bdev; + struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; unsigned long i, num_pages; unsigned long bio_flags = 0; @@ -3476,7 +3581,7 @@ static int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, eb->tree, p, offset >> 9, + ret = submit_extent_page(rw, tree, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags); @@ -3573,9 +3678,8 @@ retry: * but no sense in crashing the users box for something * we can survive anyway. */ - if (!eb) { + if (WARN_ON(!eb)) { spin_unlock(&mapping->private_lock); - WARN_ON(1); continue; } @@ -3651,6 +3755,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, struct inode *inode = mapping->host; int ret = 0; int done = 0; + int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; @@ -3737,8 +3842,8 @@ retry: unlock_page(page); ret = 0; } - if (ret) - done = 1; + if (!err && ret < 0) + err = ret; /* * the filesystem may choose to bump up nr_to_write. @@ -3750,7 +3855,7 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!scanned && !done && !err) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -3760,7 +3865,7 @@ retry: goto retry; } btrfs_add_delayed_iput(inode); - return ret; + return err; } static void flush_epd_write_bio(struct extent_page_data *epd) @@ -4042,7 +4147,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (offset >= last) return NULL; - while(1) { + while (1) { len = last - offset; if (len == 0) break; @@ -4066,6 +4171,19 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, return NULL; } +static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx) +{ + unsigned long cnt = *((unsigned long *)ctx); + + cnt++; + *((unsigned long *)ctx) = cnt; + + /* Now we're sure that the extent is shared. */ + if (cnt > 1) + return 1; + return 0; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -4082,12 +4200,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_file_extent_item *item; int end = 0; u64 em_start = 0; u64 em_len = 0; u64 em_end = 0; - unsigned long emflags; if (len == 0) return -EINVAL; @@ -4112,8 +4228,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } WARN_ON(!ret); path->slots[0]--; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); @@ -4132,7 +4246,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last = found_key.offset; last_for_get_extent = last + 1; } - btrfs_free_path(path); + btrfs_release_path(path); /* * we might have some extents allocated but more delalloc past those @@ -4181,7 +4295,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; - emflags = em->flags; disko = 0; flags = 0; @@ -4202,7 +4315,24 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } else { + unsigned long ref_cnt = 0; + disko = em->block_start + offset_in_extent; + + /* + * As btrfs supports shared space, this information + * can be exported to userspace tools via + * flag FIEMAP_EXTENT_SHARED. + */ + ret = iterate_inodes_from_logical( + em->block_start, + BTRFS_I(inode)->root->fs_info, + path, count_ext_ref, &ref_cnt); + if (ret < 0 && ret != -ENOENT) + goto out_free; + + if (ref_cnt > 1) + flags |= FIEMAP_EXTENT_SHARED; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; @@ -4234,6 +4364,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, out_free: free_extent_map(em); out: + btrfs_free_path(path); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state, GFP_NOFS); return ret; @@ -4245,7 +4376,7 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -static int extent_buffer_under_io(struct extent_buffer *eb) +int extent_buffer_under_io(struct extent_buffer *eb) { return (atomic_read(&eb->io_pages) || test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || @@ -4315,10 +4446,9 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) __free_extent_buffer(eb); } -static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, - unsigned long len, - gfp_t mask) +static struct extent_buffer * +__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, + unsigned long len, gfp_t mask) { struct extent_buffer *eb = NULL; @@ -4327,7 +4457,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; - eb->tree = tree; + eb->fs_info = fs_info; eb->bflags = 0; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); @@ -4446,7 +4576,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } -static void mark_extent_buffer_accessed(struct extent_buffer *eb) +static void mark_extent_buffer_accessed(struct extent_buffer *eb, + struct page *accessed) { unsigned long num_pages, i; @@ -4455,11 +4586,77 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb) num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); - mark_page_accessed(p); + if (p != accessed) + mark_page_accessed(p); } } -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + mark_extent_buffer_accessed(eb, NULL); + return eb; + } + rcu_read_unlock(); + + return NULL; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) +{ + struct extent_buffer *eb, *exists = NULL; + int ret; + + eb = find_extent_buffer(fs_info, start); + if (eb) + return eb; + eb = alloc_dummy_extent_buffer(start, len); + if (!eb) + return NULL; + eb->fs_info = fs_info; +again: + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; + } + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + + /* + * We will free dummy extent buffer's if they come into + * free_extent_buffer with a ref count of 2, but if we are using this we + * want the buffers to stay in memory until we're done with them, so + * bump the ref count again. + */ + atomic_inc(&eb->refs); + return eb; +free_eb: + btrfs_release_extent_buffer(eb); + return exists; +} +#endif + +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len) { unsigned long num_pages = num_extent_pages(start, len); @@ -4468,20 +4665,15 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *eb; struct extent_buffer *exists = NULL; struct page *p; - struct address_space *mapping = tree->mapping; + struct address_space *mapping = fs_info->btree_inode->i_mapping; int uptodate = 1; int ret; - rcu_read_lock(); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - mark_extent_buffer_accessed(eb); + eb = find_extent_buffer(fs_info, start); + if (eb) return eb; - } - rcu_read_unlock(); - eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); + eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); if (!eb) return NULL; @@ -4504,7 +4696,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, spin_unlock(&mapping->private_lock); unlock_page(p); page_cache_release(p); - mark_extent_buffer_accessed(exists); + mark_extent_buffer_accessed(exists, p); goto free_eb; } @@ -4519,7 +4711,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); WARN_ON(PageDirty(p)); - mark_page_accessed(p); eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -4536,26 +4727,21 @@ again: if (ret) goto free_eb; - spin_lock(&tree->buffer_lock); - ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); if (ret == -EEXIST) { - exists = radix_tree_lookup(&tree->buffer, - start >> PAGE_CACHE_SHIFT); - if (!atomic_inc_not_zero(&exists->refs)) { - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); - exists = NULL; + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else goto again; - } - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); - mark_extent_buffer_accessed(exists); - goto free_eb; } /* add one reference for the tree */ check_buffer_tree_ref(eb); - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * there is a race where release page may have @@ -4586,23 +4772,6 @@ free_eb: return exists; } -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len) -{ - struct extent_buffer *eb; - - rcu_read_lock(); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - mark_extent_buffer_accessed(eb); - return eb; - } - rcu_read_unlock(); - - return NULL; -} - static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) { struct extent_buffer *eb = @@ -4616,17 +4785,17 @@ static int release_extent_buffer(struct extent_buffer *eb) { WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { - if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { - spin_unlock(&eb->refs_lock); - } else { - struct extent_io_tree *tree = eb->tree; + if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { + struct btrfs_fs_info *fs_info = eb->fs_info; spin_unlock(&eb->refs_lock); - spin_lock(&tree->buffer_lock); - radix_tree_delete(&tree->buffer, + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, eb->start >> PAGE_CACHE_SHIFT); - spin_unlock(&tree->buffer_lock); + spin_unlock(&fs_info->buffer_lock); + } else { + spin_unlock(&eb->refs_lock); } /* Should be safe to release our pages at this point */ @@ -4899,6 +5068,43 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } } +int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char __user *dst = (char __user *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = page_address(page); + if (copy_to_user(dst, kaddr + offset, cur)) { + ret = -EFAULT; + break; + } + + dst += cur; + len -= cur; + offset = 0; + i++; + } + + return ret; +} + int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, char **map, unsigned long *map_start, @@ -5066,23 +5272,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } -static void move_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = page_address(dst_page); - if (dst_page == src_page) { - memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); - } else { - char *src_kaddr = page_address(src_page); - char *p = dst_kaddr + dst_off + len; - char *s = src_kaddr + src_off + len; - - while (len--) - *--p = *--s; - } -} - static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) { unsigned long distance = (src > dst) ? src - dst : dst - src; @@ -5122,12 +5311,12 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu dst len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu dst len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } @@ -5169,12 +5358,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } @@ -5193,7 +5382,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - move_pages(extent_buffer_page(dst, dst_i), + copy_pages(extent_buffer_page(dst, dst_i), extent_buffer_page(dst, src_i), dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6dbc645f1f3..ccc264e7bde 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -43,6 +43,7 @@ #define EXTENT_BUFFER_WRITEBACK 7 #define EXTENT_BUFFER_IOERR 8 #define EXTENT_BUFFER_DUMMY 9 +#define EXTENT_BUFFER_IN_TREE 10 /* these are flags for extent_clear_unlock_delalloc */ #define PAGE_UNLOCK (1 << 0) @@ -94,12 +95,10 @@ struct extent_io_ops { struct extent_io_tree { struct rb_root state; - struct radix_tree_root buffer; struct address_space *mapping; u64 dirty_bytes; int track_uptodate; spinlock_t lock; - spinlock_t buffer_lock; struct extent_io_ops *ops; }; @@ -130,7 +129,7 @@ struct extent_buffer { unsigned long map_start; unsigned long map_len; unsigned long bflags; - struct extent_io_tree *tree; + struct btrfs_fs_info *fs_info; spinlock_t refs_lock; atomic_t refs; atomic_t io_pages; @@ -159,7 +158,6 @@ struct extent_buffer { * to unlock */ wait_queue_head_t read_lock_wq; - wait_queue_head_t lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; @@ -266,12 +264,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len); +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 @@ -305,6 +303,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, void read_extent_buffer(struct extent_buffer *eb, void *dst, unsigned long start, unsigned long len); +int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst, + unsigned long start, + unsigned long len); void write_extent_buffer(struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, @@ -321,6 +322,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_uptodate(struct extent_buffer *eb); int clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_under_io(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, @@ -345,4 +347,12 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, int end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes); +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len); +#endif #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a4a7a1a8da9..225302b39af 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void) em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); if (!em) return NULL; - em->in_tree = 0; + RB_CLEAR_NODE(&em->rb_node); em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; em->generation = 0; @@ -73,38 +73,64 @@ void free_extent_map(struct extent_map *em) return; WARN_ON(atomic_read(&em->refs) == 0); if (atomic_dec_and_test(&em->refs)) { - WARN_ON(em->in_tree); + WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) + kfree(em->bdev); kmem_cache_free(extent_map_cache, em); } } -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +static int tree_insert(struct rb_root *root, struct extent_map *em) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; - struct extent_map *entry; + struct extent_map *entry = NULL; + struct rb_node *orig_parent = NULL; + u64 end = range_end(em->start, em->len); while (*p) { parent = *p; entry = rb_entry(parent, struct extent_map, rb_node); - WARN_ON(!entry->in_tree); - - if (offset < entry->start) + if (em->start < entry->start) p = &(*p)->rb_left; - else if (offset >= extent_map_end(entry)) + else if (em->start >= extent_map_end(entry)) p = &(*p)->rb_right; else - return parent; + return -EEXIST; } - entry = rb_entry(node, struct extent_map, rb_node); - entry->in_tree = 1; - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; + orig_parent = parent; + while (parent && em->start >= extent_map_end(entry)) { + parent = rb_next(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + parent = orig_parent; + entry = rb_entry(parent, struct extent_map, rb_node); + while (parent && em->start < entry->start) { + parent = rb_prev(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + rb_link_node(&em->rb_node, orig_parent, p); + rb_insert_color(&em->rb_node, root); + return 0; } /* @@ -126,8 +152,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, prev = n; prev_entry = entry; - WARN_ON(!entry->in_tree); - if (offset < entry->start) n = n->rb_left; else if (offset >= extent_map_end(entry)) @@ -213,12 +237,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->len += merge->len; em->block_len += merge->block_len; em->block_start = merge->block_start; - merge->in_tree = 0; em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); rb_erase(&merge->rb_node, &tree->map); + RB_CLEAR_NODE(&merge->rb_node); free_extent_map(merge); } } @@ -228,9 +252,9 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(em, merge)) { em->len += merge->len; - em->block_len += merge->len; + em->block_len += merge->block_len; rb_erase(&merge->rb_node, &tree->map); - merge->in_tree = 0; + RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); free_extent_map(merge); @@ -292,7 +316,21 @@ out: void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { clear_bit(EXTENT_FLAG_LOGGING, &em->flags); - if (em->in_tree) + if (extent_map_in_tree(em)) + try_merge_map(tree, em); +} + +static inline void setup_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, + int modified) +{ + atomic_inc(&em->refs); + em->mod_start = em->start; + em->mod_len = em->len; + + if (modified) + list_move(&em->list, &tree->modified_extents); + else try_merge_map(tree, em); } @@ -310,41 +348,16 @@ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified) { int ret = 0; - struct rb_node *rb; - struct extent_map *exist; - exist = lookup_extent_mapping(tree, em->start, em->len); - if (exist) { - free_extent_map(exist); - ret = -EEXIST; - goto out; - } - rb = tree_insert(&tree->map, em->start, &em->rb_node); - if (rb) { - ret = -EEXIST; + ret = tree_insert(&tree->map, em); + if (ret) goto out; - } - atomic_inc(&em->refs); - em->mod_start = em->start; - em->mod_len = em->len; - - if (modified) - list_move(&em->list, &tree->modified_extents); - else - try_merge_map(tree, em); + setup_extent_mapping(tree, em, modified); out: return ret; } -/* simple helper to do math around the end of an extent, handling wrap */ -static u64 range_end(u64 start, u64 len) -{ - if (start + len < start) - return (u64)-1; - return start + len; -} - static struct extent_map * __lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len, int strict) @@ -424,6 +437,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) rb_erase(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) list_del_init(&em->list); - em->in_tree = 0; + RB_CLEAR_NODE(&em->rb_node); return ret; } + +void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified) +{ + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); + ASSERT(extent_map_in_tree(cur)); + if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) + list_del_init(&cur->list); + rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map); + RB_CLEAR_NODE(&cur->rb_node); + + setup_extent_mapping(tree, new, modified); +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 61adc44b780..b2991fd8583 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -3,10 +3,10 @@ #include <linux/rbtree.h> -#define EXTENT_MAP_LAST_BYTE (u64)-4 -#define EXTENT_MAP_HOLE (u64)-3 -#define EXTENT_MAP_INLINE (u64)-2 -#define EXTENT_MAP_DELALLOC (u64)-1 +#define EXTENT_MAP_LAST_BYTE ((u64)-4) +#define EXTENT_MAP_HOLE ((u64)-3) +#define EXTENT_MAP_INLINE ((u64)-2) +#define EXTENT_MAP_DELALLOC ((u64)-1) /* bits for the flags field */ #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ @@ -15,6 +15,7 @@ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ #define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ +#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */ struct extent_map { struct rb_node rb_node; @@ -33,7 +34,6 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - unsigned int in_tree; unsigned int compress_type; struct list_head list; }; @@ -44,6 +44,11 @@ struct extent_map_tree { rwlock_t lock; }; +static inline int extent_map_in_tree(const struct extent_map *em) +{ + return !RB_EMPTY_NODE(&em->rb_node); +} + static inline u64 extent_map_end(struct extent_map *em) { if (em->start + em->len < em->start) @@ -64,6 +69,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified); struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 4f53159bdb9..f46cfe45d68 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -182,7 +182,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (!path) return -ENOMEM; - nblocks = bio->bi_size >> inode->i_sb->s_blocksize_bits; + nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, @@ -201,7 +201,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, csum = (u8 *)dst; } - if (bio->bi_size > PAGE_CACHE_SIZE * 8) + if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) path->reada = 2; WARN_ON(bio->bi_vcnt <= 0); @@ -217,7 +217,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, path->skip_locking = 1; } - disk_bytenr = (u64)bio->bi_sector << 9; + disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; if (dio) offset = logical_offset; while (bio_index < bio->bi_vcnt) { @@ -246,8 +246,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, offset + bvec->bv_len - 1, EXTENT_NODATASUM, GFP_NOFS); } else { - printk(KERN_INFO "btrfs no csum found " - "for inode %llu start %llu\n", + btrfs_info(BTRFS_I(inode)->root->fs_info, + "no csum found for inode %llu start %llu", btrfs_ino(inode), offset); } item = NULL; @@ -281,10 +281,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, found: csum += count * csum_size; nblocks -= count; + bio_index += count; while (count--) { disk_bytenr += bvec->bv_len; offset += bvec->bv_len; - bio_index++; bvec++; } } @@ -302,7 +302,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 offset) { - int len = (bio->bi_sector << 9) - dip->disk_bytenr; + int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int ret; @@ -329,6 +329,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, u64 csum_end; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + ASSERT(start == ALIGN(start, root->sectorsize) && + (end + 1) == ALIGN(end + 1, root->sectorsize)); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -444,11 +447,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, u64 offset; WARN_ON(bio->bi_vcnt <= 0); - sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size), + GFP_NOFS); if (!sums) return -ENOMEM; - sums->len = bio->bi_size; + sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); if (contig) @@ -458,7 +462,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = (u64)bio->bi_sector << 9; + sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; while (bio_index < bio->bi_vcnt) { @@ -473,7 +477,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, btrfs_add_ordered_sum(inode, ordered, sums); btrfs_put_ordered_extent(ordered); - bytes_left = bio->bi_size - total_bytes; + bytes_left = bio->bi_iter.bi_size - total_bytes; sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), GFP_NOFS); @@ -481,7 +485,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ((u64)bio->bi_sector << 9) + + sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + total_bytes; index = 0; } @@ -746,7 +750,7 @@ again: int slot = path->slots[0] + 1; /* we didn't find a csum item, insert one */ nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems - 1) { + if (!nritems || (path->slots[0] >= nritems - 1)) { ret = btrfs_next_leaf(root, path); if (ret == 1) found_next = 1; @@ -846,10 +850,8 @@ insert: path->leave_spinning = 0; if (ret < 0) goto fail_unlock; - if (ret != 0) { - WARN_ON(1); + if (WARN_ON(ret != 0)) goto fail_unlock; - } leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); @@ -883,3 +885,79 @@ out: fail_unlock: goto out; } + +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf = path->nodes[0]; + const int slot = path->slots[0]; + struct btrfs_key key; + u64 extent_start, extent_end; + u64 bytenr; + u8 type = btrfs_file_extent_type(leaf, fi); + int compress_type = btrfs_file_extent_compression(leaf, fi); + + em->bdev = root->fs_info->fs_devices->latest_bdev; + btrfs_item_key_to_cpu(leaf, &key, slot); + extent_start = key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, slot, fi); + extent_end = ALIGN(extent_start + size, root->sectorsize); + } + + em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, fi); + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + return; + } + if (compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + em->block_start = bytenr; + em->block_len = em->orig_block_len; + } else { + bytenr += btrfs_file_extent_offset(leaf, fi); + em->block_start = bytenr; + em->block_len = em->len; + if (type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + em->block_start = EXTENT_MAP_INLINE; + em->start = extent_start; + em->len = extent_end - extent_start; + /* + * Initialize orig_start and block_len with the same values + * as in inode.c:btrfs_get_extent(). + */ + em->orig_start = EXTENT_MAP_HOLE; + em->block_len = (u64)-1; + if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } + } else { + btrfs_err(root->fs_info, + "unknown file extent item type %d, inode %llu, offset %llu, root %llu", + type, btrfs_ino(inode), extent_start, + root->root_key.objectid); + } +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 72da4df53c9..1f2b99cb55e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -39,8 +39,8 @@ #include "print-tree.h" #include "tree-log.h" #include "locking.h" -#include "compat.h" #include "volumes.h" +#include "qgroup.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -370,7 +370,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) u64 root_objectid = 0; atomic_inc(&fs_info->defrag_running); - while(1) { + while (1) { /* Pause the auto defragger. */ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) @@ -426,13 +426,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, struct page *page = prepared_pages[pg]; /* * Copy data from userspace to the current page - * - * Disable pagefault to avoid recursive lock since - * the pages are already locked */ - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, count); - pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); @@ -453,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, write_bytes -= copied; total_copied += copied; - /* Return to btrfs_file_aio_write to fault page */ + /* Return to btrfs_file_write_iter to fault page */ if (unlikely(copied == 0)) break; @@ -476,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) for (i = 0; i < num_pages; i++) { /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty - * clear it here + * clear it here. There should be no need to mark the pages + * accessed as prepare_pages should have marked them accessed + * in prepare_pages via find_or_create_page() */ ClearPageChecked(pages[i]); unlock_page(pages[i]); - mark_page_accessed(pages[i]); page_cache_release(pages[i]); } } @@ -592,7 +588,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &flags); modified = !list_empty(&em->list); - remove_extent_mapping(em_tree, em); if (no_splits) goto next; @@ -623,8 +618,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; - ret = add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + replace_extent_mapping(em_tree, em, split, modified); free_extent_map(split); split = split2; split2 = NULL; @@ -662,12 +656,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->orig_block_len = 0; } - ret = add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + ret = add_extent_mapping(em_tree, split, + modified); + ASSERT(ret == 0); /* Logic error */ + } free_extent_map(split); split = NULL; } next: + if (extent_map_in_tree(em)) + remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); /* once for us */ @@ -693,7 +695,10 @@ next: int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache) + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted) { struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -711,15 +716,18 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int recow; int ret; int modify_tree = -1; - int update_refs = (root->ref_cows || root == root->fs_info->tree_root); + int update_refs; int found = 0; + int leafs_visited = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); - if (start >= BTRFS_I(inode)->disk_i_size) + if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; + update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -734,6 +742,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, path->slots[0]--; } ret = 0; + leafs_visited++; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -745,6 +754,7 @@ next_slot: ret = 0; break; } + leafs_visited++; leaf = path->nodes[0]; recow = 1; } @@ -767,12 +777,25 @@ next_slot: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); } else { WARN_ON(1); extent_end = search_start; } + /* + * Don't skip extent items representing 0 byte lengths. They + * used to be created (bug) if while punching holes we hit + * -ENOSPC condition. So if we find one here, just ensure we + * delete it, otherwise we would insert a new file extent item + * with the same key (offset) as that 0 bytes length file + * extent item in the call to setup_items_for_insert() later + * in this function. + */ + if (extent_end == key.offset && extent_end >= search_start) + goto delete_extent_item; + if (extent_end <= search_start) { path->slots[0]++; goto next_slot; @@ -792,7 +815,10 @@ next_slot: */ if (start > key.offset && end < extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = start; @@ -825,7 +851,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 0); + start - extent_offset, 1); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -835,7 +861,10 @@ next_slot: * | -------- extent -------- | */ if (start <= key.offset && end < extent_end) { - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; @@ -858,7 +887,10 @@ next_slot: */ if (start > key.offset && end >= extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); @@ -877,6 +909,7 @@ next_slot: * | ------ extent ------ | */ if (start <= key.offset && end >= extent_end) { +delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; del_nr = 1; @@ -928,14 +961,52 @@ next_slot: } if (!ret && del_nr > 0) { + /* + * Set path->slots[0] to first slot, so that after the delete + * if items are move off from our leaf to its immediate left or + * right neighbor leafs, we end up with a correct and adjusted + * path->slots[0] for our insertion (if replace_extent != 0). + */ + path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) btrfs_abort_transaction(trans, root, ret); } + leaf = path->nodes[0]; + /* + * If btrfs_del_items() was called, it might have deleted a leaf, in + * which case it unlocked our path, so check path->locks[0] matches a + * write lock. + */ + if (!ret && replace_extent && leafs_visited == 1 && + (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || + path->locks[0] == BTRFS_WRITE_LOCK) && + btrfs_leaf_free_space(root, leaf) >= + sizeof(struct btrfs_item) + extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { + struct btrfs_key slot_key; + + btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); + if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) + path->slots[0]++; + } + setup_items_for_insert(root, path, &key, + &extent_item_size, + extent_item_size, + sizeof(struct btrfs_item) + + extent_item_size, 1); + *key_inserted = 1; + } + + if (!replace_extent || !(*key_inserted)) + btrfs_release_path(path); if (drop_end) *drop_end = found ? min(end, extent_end) : end; - btrfs_release_path(path); return ret; } @@ -950,7 +1021,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, - drop_cache); + drop_cache, 0, 0, NULL); btrfs_free_path(path); return ret; } @@ -1137,7 +1208,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset, 1); BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1236,29 +1307,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos, } /* - * this gets pages into the page cache and locks them down, it also properly - * waits for data=ordered extents to finish before allowing the pages to be - * modified. + * this just gets pages into the page cache and locks them down. */ -static noinline int prepare_pages(struct btrfs_root *root, struct file *file, - struct page **pages, size_t num_pages, - loff_t pos, unsigned long first_index, - size_t write_bytes, bool force_uptodate) +static noinline int prepare_pages(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + size_t write_bytes, bool force_uptodate) { - struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; - struct inode *inode = file_inode(file); gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; - int faili = 0; - u64 start_pos; - u64 last_pos; + int faili; - start_pos = pos & ~((u64)root->sectorsize - 1); - last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; - -again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, mask | __GFP_WRITE); @@ -1281,54 +1341,82 @@ again: } wait_on_page_writeback(pages[i]); } - err = 0; + + return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + +} + +/* + * This function locks the extent and properly waits for data=ordered extents + * to finish before allowing the pages to be modified if need. + * + * The return value: + * 1 - the extent is locked + * 0 - the extent is not locked, and everything is OK + * -EAGAIN - need re-prepare the pages + * the other < 0 number - Something wrong happens + */ +static noinline int +lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + u64 *lockstart, u64 *lockend, + struct extent_state **cached_state) +{ + u64 start_pos; + u64 last_pos; + int i; + int ret = 0; + + start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); + last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; + if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, 0, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, - last_pos - 1); + start_pos, last_pos, 0, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start_pos, + last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->len > start_pos && - ordered->file_offset < last_pos) { - btrfs_put_ordered_extent(ordered); + ordered->file_offset <= last_pos) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, - &cached_state, GFP_NOFS); + start_pos, last_pos, + cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos); - goto again; + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, &cached_state, - GFP_NOFS); + 0, 0, cached_state, GFP_NOFS); + *lockstart = start_pos; + *lockend = last_pos; + ret = 1; } + for (i = 0; i < num_pages; i++) { if (clear_page_dirty_for_io(pages[i])) account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } - return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - page_cache_release(pages[faili]); - faili--; - } - return err; + return ret; } static noinline int check_can_nocow(struct inode *inode, loff_t pos, @@ -1340,8 +1428,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, u64 num_bytes; int ret; + ret = btrfs_start_nocow_write(root); + if (!ret) + return -ENOSPC; + lockstart = round_down(pos, root->sectorsize); - lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; + lockend = round_up(pos + *write_bytes, root->sectorsize) - 1; while (1) { lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); @@ -1359,12 +1451,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; + btrfs_end_nocow_write(root); } else { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, - NULL, GFP_NOFS); - *write_bytes = min_t(size_t, *write_bytes, num_bytes); + *write_bytes = min_t(size_t, *write_bytes , + num_bytes - pos + lockstart); } unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); @@ -1379,13 +1469,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; + struct extent_state *cached_state = NULL; u64 release_bytes = 0; + u64 lockstart; + u64 lockend; unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; bool only_release_metadata = false; bool force_page_uptodate = false; + bool need_unlock; nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / @@ -1450,22 +1544,37 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (!only_release_metadata) btrfs_free_reserved_data_space(inode, reserve_bytes); + else + btrfs_end_nocow_write(root); break; } release_bytes = reserve_bytes; - + need_unlock = false; +again: /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the * contents of pages from loop to loop */ - ret = prepare_pages(root, file, pages, num_pages, - pos, first_index, write_bytes, + ret = prepare_pages(inode, pages, num_pages, + pos, write_bytes, force_page_uptodate); if (ret) break; + ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, + pos, &lockstart, &lockend, + &cached_state); + if (ret < 0) { + if (ret == -EAGAIN) + goto again; + break; + } else if (ret > 0) { + need_unlock = true; + ret = 0; + } + copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, i); @@ -1510,18 +1619,23 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = dirty_pages << PAGE_CACHE_SHIFT; - if (copied > 0) { + + if (copied > 0) ret = btrfs_dirty_pages(root, inode, pages, dirty_pages, pos, copied, NULL); - if (ret) { - btrfs_drop_pages(pages, num_pages); - break; - } + if (need_unlock) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state, + GFP_NOFS); + if (ret) { + btrfs_drop_pages(pages, num_pages); + break; } release_bytes = 0; - btrfs_drop_pages(pages, num_pages); + if (only_release_metadata) + btrfs_end_nocow_write(root); if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); @@ -1534,6 +1648,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, only_release_metadata = false; } + btrfs_drop_pages(pages, num_pages); + cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); @@ -1547,37 +1663,34 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, kfree(pages); if (release_bytes) { - if (only_release_metadata) + if (only_release_metadata) { + btrfs_end_nocow_write(root); btrfs_delalloc_release_metadata(inode, release_bytes); - else + } else { btrfs_delalloc_release_space(inode, release_bytes); + } } return num_written ? num_written : ret; } static ssize_t __btrfs_direct_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - loff_t *ppos, size_t count, size_t ocount) + struct iov_iter *from, + loff_t pos) { struct file *file = iocb->ki_filp; - struct iov_iter i; ssize_t written; ssize_t written_buffered; loff_t endbyte; int err; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, - count, ocount); + written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) + if (written < 0 || !iov_iter_count(from)) return written; pos += written; - count -= written; - iov_iter_init(&i, iov, nr_segs, count, written); - written_buffered = __btrfs_buffered_write(file, &i, pos); + written_buffered = __btrfs_buffered_write(file, from, pos); if (written_buffered < 0) { err = written_buffered; goto out; @@ -1587,7 +1700,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, if (err) goto out; written += written_buffered; - *ppos = pos + written_buffered; + iocb->ki_pos = pos + written_buffered; invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); out: @@ -1612,29 +1725,22 @@ static void update_time_for_write(struct inode *inode) inode_inc_iversion(inode); } -static ssize_t btrfs_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; - loff_t *ppos = &iocb->ki_pos; u64 start_pos; + u64 end_pos; ssize_t num_written = 0; ssize_t err = 0; - size_t count, ocount; + size_t count = iov_iter_count(from); bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); + loff_t pos = iocb->ki_pos; mutex_lock(&inode->i_mutex); - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } - count = ocount; - current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) { @@ -1647,6 +1753,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } + iov_iter_truncate(from, count); + err = file_remove_suid(file); if (err) { mutex_unlock(&inode->i_mutex); @@ -1675,7 +1783,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { - err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); + /* Expand hole size to cover write data, preventing empty gap */ + end_pos = round_up(pos + count, root->sectorsize); + err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { mutex_unlock(&inode->i_mutex); goto out; @@ -1686,16 +1796,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (unlikely(file->f_flags & O_DIRECT)) { - num_written = __btrfs_direct_write(iocb, iov, nr_segs, - pos, ppos, count, ocount); + num_written = __btrfs_direct_write(iocb, from, pos); } else { - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, num_written); - - num_written = __btrfs_buffered_write(file, &i, pos); + num_written = __btrfs_buffered_write(file, from, pos); if (num_written > 0) - *ppos = pos + num_written; + iocb->ki_pos = pos + num_written; } mutex_unlock(&inode->i_mutex); @@ -1720,7 +1825,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0) { err = generic_write_sync(file, pos, num_written); - if (err < 0 && num_written > 0) + if (err < 0) num_written = err; } @@ -1779,8 +1884,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; struct btrfs_trans_handle *trans; + struct btrfs_log_ctx ctx; + int ret = 0; bool full_sync = 0; trace_btrfs_sync_file(file, datasync); @@ -1809,8 +1915,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - if (full_sync) - btrfs_wait_ordered_range(inode, start, end - start + 1); + if (full_sync) { + ret = btrfs_wait_ordered_range(inode, start, end - start + 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } atomic_inc(&root->log_batch); /* @@ -1850,14 +1961,28 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (file->private_data) btrfs_ioctl_trans_end(file); + /* + * We use start here because we will need to wait on the IO to complete + * in btrfs_sync_log, which could require joining a transaction (for + * example checking cross references in the nocow path). If we use join + * here we could get into a situation where we're waiting on IO to + * happen that is blocked on a transaction trying to commit. With start + * we inc the extwriter counter, so we wait for all extwriters to exit + * before we start blocking join'ers. This comment is to keep somebody + * from thinking they are super smart and changing this to + * btrfs_join_transaction *cough*Josef*cough*. + */ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); mutex_unlock(&inode->i_mutex); goto out; } + trans->sync = true; + + btrfs_init_log_ctx(&ctx); - ret = btrfs_log_dentry_safe(trans, root, dentry); + ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -1876,27 +2001,22 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) mutex_unlock(&inode->i_mutex); if (ret != BTRFS_NO_LOG_SYNC) { - if (ret > 0) { - /* - * If we didn't already wait for ordered extents we need - * to do that now. - */ - if (!full_sync) - btrfs_wait_ordered_range(inode, start, - end - start + 1); - ret = btrfs_commit_transaction(trans, root); - } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) { + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); + if (!ret) { ret = btrfs_end_transaction(trans, root); - } else { - if (!full_sync) - btrfs_wait_ordered_range(inode, start, - end - - start + 1); - ret = btrfs_commit_transaction(trans, root); + goto out; + } + } + if (!full_sync) { + ret = btrfs_wait_ordered_range(inode, start, + end - start + 1); + if (ret) { + btrfs_end_transaction(trans, root); + goto out; } } + ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_end_transaction(trans, root); } @@ -1906,6 +2026,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -1963,11 +2084,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_key key; int ret; + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + goto out; + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = offset; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) return ret; @@ -2049,6 +2172,37 @@ out: return 0; } +/* + * Find a hole extent on given inode and change start/len to the end of hole + * extent.(hole/vacuum extent whose em->start <= start && + * em->start + em->len > start) + * When a hole extent is found, return 1 and modify start/len. + */ +static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) +{ + struct extent_map *em; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + return ret; + } + + /* Hole or vacuum extent(only exists in no-hole mode) */ + if (em->block_start == EXTENT_MAP_HOLE) { + ret = 1; + *len = em->start + em->len > *start + *len ? + 0 : *start + *len - em->start - em->len; + *start = em->start + em->len; + } + free_extent_map(em); + return ret; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2056,20 +2210,42 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) struct btrfs_path *path; struct btrfs_block_rsv *rsv; struct btrfs_trans_handle *trans; - u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); - u64 lockend = round_down(offset + len, - BTRFS_I(inode)->root->sectorsize) - 1; - u64 cur_offset = lockstart; + u64 lockstart; + u64 lockend; + u64 tail_start; + u64 tail_len; + u64 orig_start = offset; + u64 cur_offset; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; int ret = 0; int err = 0; - bool same_page = ((offset >> PAGE_CACHE_SHIFT) == - ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + int rsv_count; + bool same_page; + bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); + u64 ino_size; - btrfs_wait_ordered_range(inode, offset, len); + ret = btrfs_wait_ordered_range(inode, offset, len); + if (ret) + return ret; mutex_lock(&inode->i_mutex); + ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + /* Already in a large hole */ + ret = 0; + goto out_only_mutex; + } + + lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); + lockend = round_down(offset + len, + BTRFS_I(inode)->root->sectorsize) - 1; + same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + /* * We needn't truncate any page which is beyond the end of the file * because we are sure there is no data there. @@ -2079,14 +2255,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * entire page. */ if (same_page && len < PAGE_CACHE_SIZE) { - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) + if (offset < ino_size) ret = btrfs_truncate_page(inode, offset, len, 0); - mutex_unlock(&inode->i_mutex); - return ret; + goto out_only_mutex; } /* zero back part of the first page */ - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + if (offset < ino_size) { ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2094,12 +2269,39 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } } - /* zero the front end of the last page */ - if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { - ret = btrfs_truncate_page(inode, offset + len, 0, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + /* Check the aligned pages after the first unaligned page, + * if offset != orig_start, which means the first unaligned page + * including serveral following pages are already in holes, + * the extra check can be skipped */ + if (offset == orig_start) { + /* after truncate page, check hole again */ + len = offset + len - lockstart; + offset = lockstart; + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + ret = 0; + goto out_only_mutex; + } + lockstart = offset; + } + + /* Check the tail unaligned part is in a hole */ + tail_start = lockend + 1; + tail_len = offset + len - tail_start; + if (tail_len) { + ret = find_first_non_hole(inode, &tail_start, &tail_len); + if (unlikely(ret < 0)) + goto out_only_mutex; + if (!ret) { + /* zero the front end of the last page */ + if (tail_start + tail_len < ino_size) { + ret = btrfs_truncate_page(inode, + tail_start + tail_len, 0, 1); + if (ret) + goto out_only_mutex; + } } } @@ -2123,11 +2325,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * we need to try again. */ if ((!ordered || - (ordered->file_offset + ordered->len < lockstart || + (ordered->file_offset + ordered->len <= lockstart || ordered->file_offset > lockend)) && - !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_UPTODATE, 0, - cached_state)) { + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { if (ordered) btrfs_put_ordered_extent(ordered); break; @@ -2136,8 +2336,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); - btrfs_wait_ordered_range(inode, lockstart, - lockend - lockstart + 1); + ret = btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } } path = btrfs_alloc_path(); @@ -2157,9 +2361,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent + * 1 - adding the hole extent if no_holes isn't set */ - trans = btrfs_start_transaction(root, 3); + rsv_count = no_holes ? 2 : 3; + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; @@ -2170,19 +2375,24 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) BUG_ON(ret); trans->block_rsv = rsv; + cur_offset = lockstart; + len = lockend - cur_offset; while (cur_offset < lockend) { ret = __btrfs_drop_extents(trans, root, inode, path, cur_offset, lockend + 1, - &drop_end, 1); + &drop_end, 1, 0, 0, NULL); if (ret != -ENOSPC) break; trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - break; + if (cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, + drop_end); + if (ret) { + err = ret; + break; + } } cur_offset = drop_end; @@ -2196,7 +2406,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; @@ -2207,6 +2417,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) rsv, min_size); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; + + ret = find_first_non_hole(inode, &cur_offset, &len); + if (unlikely(ret < 0)) + break; + if (ret && !len) { + ret = 0; + break; + } } if (ret) { @@ -2215,10 +2433,17 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - goto out_trans; + /* + * Don't insert file hole extent item if it's for a range beyond eof + * (because it's useless) or if it represents a 0 bytes range (when + * cur_offset == drop_end). + */ + if (cur_offset < ino_size && cur_offset < drop_end) { + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + goto out_trans; + } } out_trans: @@ -2238,6 +2463,7 @@ out_free: out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); +out_only_mutex: mutex_unlock(&inode->i_mutex); if (ret && !err) err = ret; @@ -2308,7 +2534,10 @@ static long btrfs_fallocate(struct file *file, int mode, * wait for ordered IO before we have any locks. We'll loop again * below with the locks held. */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + ret = btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + if (ret) + goto out; locked_end = alloc_end - 1; while (1) { @@ -2332,8 +2561,10 @@ static long btrfs_fallocate(struct file *file, int mode, * we can't wait on the range with the transaction * running or with the extent lock held */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); + ret = btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + if (ret) + goto out; } else { if (ordered) btrfs_put_ordered_extent(ordered); @@ -2405,14 +2636,12 @@ out_reserve_fail: static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map *em; + struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 lockstart = *offset; u64 lockend = i_size_read(inode); u64 start = *offset; - u64 orig_start = *offset; u64 len = i_size_read(inode); - u64 last_end = 0; int ret = 0; lockend = max_t(u64, root->sectorsize, lockend); @@ -2429,89 +2658,35 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, &cached_state); - /* - * Delalloc is such a pain. If we have a hole and we have pending - * delalloc for a portion of the hole we will get back a hole that - * exists for the entire range since it hasn't been actually written - * yet. So to take care of this case we need to look for an extent just - * before the position we want in case there is outstanding delalloc - * going on here. - */ - if (whence == SEEK_HOLE && start != 0) { - if (start <= root->sectorsize) - em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, - root->sectorsize, 0); - else - em = btrfs_get_extent_fiemap(inode, NULL, 0, - start - root->sectorsize, - root->sectorsize, 0); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - last_end = em->start + em->len; - if (em->block_start == EXTENT_MAP_DELALLOC) - last_end = min_t(u64, last_end, inode->i_size); - free_extent_map(em); - } - - while (1) { + while (start < inode->i_size) { em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); + em = NULL; break; } - if (em->block_start == EXTENT_MAP_HOLE) { - if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { - if (last_end <= orig_start) { - free_extent_map(em); - ret = -ENXIO; - break; - } - } - - if (whence == SEEK_HOLE) { - *offset = start; - free_extent_map(em); - break; - } - } else { - if (whence == SEEK_DATA) { - if (em->block_start == EXTENT_MAP_DELALLOC) { - if (start >= inode->i_size) { - free_extent_map(em); - ret = -ENXIO; - break; - } - } - - if (!test_bit(EXTENT_FLAG_PREALLOC, - &em->flags)) { - *offset = start; - free_extent_map(em); - break; - } - } - } + if (whence == SEEK_HOLE && + (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + break; + else if (whence == SEEK_DATA && + (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + break; start = em->start + em->len; - last_end = em->start + em->len; - - if (em->block_start == EXTENT_MAP_DELALLOC) - last_end = min_t(u64, last_end, inode->i_size); - - if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { - free_extent_map(em); - ret = -ENXIO; - break; - } free_extent_map(em); + em = NULL; cond_resched(); } - if (!ret) - *offset = min(*offset, inode->i_size); -out: + free_extent_map(em); + if (!ret) { + if (whence == SEEK_DATA && start >= inode->i_size) + ret = -ENXIO; + else + *offset = min_t(loff_t, start, inode->i_size); + } unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); return ret; @@ -2550,11 +2725,11 @@ out: const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, - .aio_write = btrfs_file_aio_write, + .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, .open = generic_file_open, .release = btrfs_release_file, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b4f9904c4c6..2b0a627cb5f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -218,7 +218,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, - struct btrfs_path *path, struct inode *inode) { int ret = 0; @@ -275,18 +274,32 @@ struct io_ctl { }; static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, - struct btrfs_root *root) + struct btrfs_root *root, int write) { + int num_pages; + int check_crcs = 0; + + num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + check_crcs = 1; + + /* Make sure we can fit our crcs into the first page */ + if (write && check_crcs && + (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) + return -ENOSPC; + memset(io_ctl, 0, sizeof(struct io_ctl)); - io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, - GFP_NOFS); + + io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); if (!io_ctl->pages) return -ENOMEM; + + io_ctl->num_pages = num_pages; io_ctl->root = root; - if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) - io_ctl->check_crcs = 1; + io_ctl->check_crcs = check_crcs; + return 0; } @@ -348,8 +361,8 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, btrfs_readpage(NULL, page); lock_page(page); if (!PageUptodate(page)) { - printk(KERN_ERR "btrfs: error reading free " - "space cache\n"); + btrfs_err(BTRFS_I(inode)->root->fs_info, + "error reading free space cache"); io_ctl_drop_pages(io_ctl); return -EIO; } @@ -406,7 +419,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { - printk_ratelimited(KERN_ERR "btrfs: space cache generation " + printk_ratelimited(KERN_ERR "BTRFS: space cache generation " "(%Lu) does not match inode (%Lu)\n", *gen, generation); io_ctl_unmap_page(io_ctl); @@ -464,7 +477,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { - printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " + printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " "space cache\n"); io_ctl_unmap_page(io_ctl); return -EIO; @@ -667,6 +680,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, generation = btrfs_free_space_generation(leaf, header); btrfs_release_path(path); + if (!BTRFS_I(inode)->generation) { + btrfs_info(root->fs_info, + "The free space cache file (%llu) is invalid. skip it\n", + offset); + return 0; + } + if (BTRFS_I(inode)->generation != generation) { btrfs_err(root->fs_info, "free space inode generation (%llu) " @@ -678,7 +698,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) return 0; - ret = io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root, 0); if (ret) return ret; @@ -832,7 +852,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, if (!matched) { __btrfs_remove_free_space_cache(ctl); - btrfs_err(fs_info, "block group %llu has wrong amount of free space", + btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->key.objectid); ret = -1; } @@ -844,7 +864,7 @@ out: spin_unlock(&block_group->lock); ret = 0; - btrfs_err(fs_info, "failed to load free space cache for block group %llu", + btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now", block_group->key.objectid); } @@ -852,90 +872,44 @@ out: return ret; } -/** - * __btrfs_write_out_cache - write out cached info to an inode - * @root - the root the inode belongs to - * @ctl - the free space cache we are going to write out - * @block_group - the block_group for this cache if it belongs to a block_group - * @trans - the trans handle - * @path - the path to use - * @offset - the offset for the key we'll insert - * - * This function writes out a free space cache struct to disk for quick recovery - * on mount. This will return 0 if it was successfull in writing the cache out, - * and -1 if it was not. - */ -static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, - struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, u64 offset) +static noinline_for_stack +int write_cache_extent_entries(struct io_ctl *io_ctl, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + int *entries, int *bitmaps, + struct list_head *bitmap_list) { - struct btrfs_free_space_header *header; - struct extent_buffer *leaf; - struct rb_node *node; - struct list_head *pos, *n; - struct extent_state *cached_state = NULL; - struct btrfs_free_cluster *cluster = NULL; - struct extent_io_tree *unpin = NULL; - struct io_ctl io_ctl; - struct list_head bitmap_list; - struct btrfs_key key; - u64 start, extent_start, extent_end, len; - int entries = 0; - int bitmaps = 0; int ret; - int err = -1; - - INIT_LIST_HEAD(&bitmap_list); - - if (!i_size_read(inode)) - return -1; - - ret = io_ctl_init(&io_ctl, inode, root); - if (ret) - return -1; + struct btrfs_free_cluster *cluster = NULL; + struct rb_node *node = rb_first(&ctl->free_space_offset); /* Get the cluster for this block_group if it exists */ - if (block_group && !list_empty(&block_group->cluster_list)) + if (block_group && !list_empty(&block_group->cluster_list)) { cluster = list_entry(block_group->cluster_list.next, struct btrfs_free_cluster, block_group_list); + } - /* Lock all pages first so we can lock the extent safely. */ - io_ctl_prepare_pages(&io_ctl, inode, 0); - - lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state); - - node = rb_first(&ctl->free_space_offset); if (!node && cluster) { node = rb_first(&cluster->root); cluster = NULL; } - /* Make sure we can fit our crcs into the first page */ - if (io_ctl.check_crcs && - (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) - goto out_nospc; - - io_ctl_set_generation(&io_ctl, trans->transid); - /* Write out the extent entries */ while (node) { struct btrfs_free_space *e; e = rb_entry(node, struct btrfs_free_space, offset_index); - entries++; + *entries += 1; - ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, + ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, e->bitmap); if (ret) - goto out_nospc; + goto fail; if (e->bitmap) { - list_add_tail(&e->list, &bitmap_list); - bitmaps++; + list_add_tail(&e->list, bitmap_list); + *bitmaps += 1; } node = rb_next(node); if (!node && cluster) { @@ -943,131 +917,289 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, cluster = NULL; } } + return 0; +fail: + return -ENOSPC; +} + +static noinline_for_stack int +update_cache_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, u64 offset, + int entries, int bitmaps) +{ + struct btrfs_key key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); + goto fail; + } + leaf = path->nodes[0]; + if (ret > 0) { + struct btrfs_key found_key; + ASSERT(path->slots[0]); + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || + found_key.offset != offset) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); + btrfs_release_path(path); + goto fail; + } + } + + BTRFS_I(inode)->generation = trans->transid; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_set_free_space_entries(leaf, header, entries); + btrfs_set_free_space_bitmaps(leaf, header, bitmaps); + btrfs_set_free_space_generation(leaf, header, trans->transid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + return 0; + +fail: + return -1; +} + +static noinline_for_stack int +write_pinned_extent_entries(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct io_ctl *io_ctl, + int *entries) +{ + u64 start, extent_start, extent_end, len; + struct extent_io_tree *unpin = NULL; + int ret; + + if (!block_group) + return 0; /* * We want to add any pinned extents to our free space cache * so we don't leak the space - */ - - /* + * * We shouldn't have switched the pinned extents yet so this is the * right one */ unpin = root->fs_info->pinned_extents; - if (block_group) - start = block_group->key.objectid; + start = block_group->key.objectid; - while (block_group && (start < block_group->key.objectid + - block_group->key.offset)) { + while (start < block_group->key.objectid + block_group->key.offset) { ret = find_first_extent_bit(unpin, start, &extent_start, &extent_end, EXTENT_DIRTY, NULL); - if (ret) { - ret = 0; - break; - } + if (ret) + return 0; /* This pinned extent is out of our range */ if (extent_start >= block_group->key.objectid + block_group->key.offset) - break; + return 0; extent_start = max(extent_start, start); extent_end = min(block_group->key.objectid + block_group->key.offset, extent_end + 1); len = extent_end - extent_start; - entries++; - ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); + *entries += 1; + ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); if (ret) - goto out_nospc; + return -ENOSPC; start = extent_end; } + return 0; +} + +static noinline_for_stack int +write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) +{ + struct list_head *pos, *n; + int ret; + /* Write out the bitmaps */ - list_for_each_safe(pos, n, &bitmap_list) { + list_for_each_safe(pos, n, bitmap_list) { struct btrfs_free_space *entry = list_entry(pos, struct btrfs_free_space, list); - ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); + ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); if (ret) - goto out_nospc; + return -ENOSPC; list_del_init(&entry->list); } - /* Zero out the rest of the pages just to make sure */ - io_ctl_zero_remaining_pages(&io_ctl); + return 0; +} - ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, - 0, i_size_read(inode), &cached_state); - io_ctl_drop_pages(&io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); +static int flush_dirty_cache(struct inode *inode) +{ + int ret; + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) - goto out; - - - btrfs_wait_ordered_range(inode, 0, (u64)-1); - - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; - key.type = 0; - - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, GFP_NOFS); - goto out; + + return ret; +} + +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, + struct io_ctl *io_ctl, + struct extent_state **cached_state, + struct list_head *bitmap_list) +{ + struct list_head *pos, *n; + + list_for_each_safe(pos, n, bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + list_del_init(&entry->list); } - leaf = path->nodes[0]; - if (ret > 0) { - struct btrfs_key found_key; - ASSERT(path->slots[0]); - path->slots[0]--; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || - found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL, GFP_NOFS); - btrfs_release_path(path); + io_ctl_drop_pages(io_ctl); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, cached_state, + GFP_NOFS); +} + +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. + */ +static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 offset) +{ + struct extent_state *cached_state = NULL; + struct io_ctl io_ctl; + LIST_HEAD(bitmap_list); + int entries = 0; + int bitmaps = 0; + int ret; + + if (!i_size_read(inode)) + return -1; + + ret = io_ctl_init(&io_ctl, inode, root, 1); + if (ret) + return -1; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { + down_write(&block_group->data_rwsem); + spin_lock(&block_group->lock); + if (block_group->delalloc_bytes) { + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + up_write(&block_group->data_rwsem); + BTRFS_I(inode)->generation = 0; + ret = 0; goto out; } + spin_unlock(&block_group->lock); } - BTRFS_I(inode)->generation = trans->transid; - header = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_free_space_header); - btrfs_set_free_space_entries(leaf, header, entries); - btrfs_set_free_space_bitmaps(leaf, header, bitmaps); - btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); + /* Lock all pages first so we can lock the extent safely. */ + io_ctl_prepare_pages(&io_ctl, inode, 0); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + 0, &cached_state); - err = 0; + io_ctl_set_generation(&io_ctl, trans->transid); + + /* Write out the extent entries in the free space cache */ + ret = write_cache_extent_entries(&io_ctl, ctl, + block_group, &entries, &bitmaps, + &bitmap_list); + if (ret) + goto out_nospc; + + /* + * Some spaces that are freed in the current transaction are pinned, + * they will be added into free space cache after the transaction is + * committed, we shouldn't lose them. + */ + ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); + if (ret) + goto out_nospc; + + /* At last, we write out all the bitmaps. */ + ret = write_bitmap_entries(&io_ctl, &bitmap_list); + if (ret) + goto out_nospc; + + /* Zero out the rest of the pages just to make sure */ + io_ctl_zero_remaining_pages(&io_ctl); + + /* Everything is written out, now we dirty the pages in the file. */ + ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + 0, i_size_read(inode), &cached_state); + if (ret) + goto out_nospc; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + /* + * Release the pages and unlock the extent, we will flush + * them out later + */ + io_ctl_drop_pages(&io_ctl); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + + /* Flush the dirty pages in the cache file. */ + ret = flush_dirty_cache(inode); + if (ret) + goto out; + + /* Update the cache item to tell everyone this cache file is valid. */ + ret = update_cache_item(trans, root, inode, path, offset, + entries, bitmaps); out: io_ctl_free(&io_ctl); - if (err) { + if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); - return err; + return ret; out_nospc: - list_for_each_safe(pos, n, &bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); - list_del_init(&entry->list); - } - io_ctl_drop_pages(&io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); + cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + goto out; } @@ -1087,6 +1219,12 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); return 0; } + + if (block_group->delalloc_bytes) { + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } spin_unlock(&block_group->lock); inode = lookup_free_space_inode(root, block_group, path); @@ -1898,7 +2036,7 @@ out: spin_unlock(&ctl->tree_lock); if (ret) { - printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); + printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret); ASSERT(ret != -EEXIST); } @@ -2007,14 +2145,15 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes && !block_group->ro) count++; - printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", - info->offset, info->bytes, + btrfs_crit(block_group->fs_info, + "entry offset %llu, bytes %llu, bitmap %s", + info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } - printk(KERN_INFO "block group has cluster?: %s\n", + btrfs_info(block_group->fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? "no" : "yes"); - printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" - "\n", count); + btrfs_info(block_group->fs_info, + "%d blocks of free space at or bigger than bytes is", count); } void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) @@ -2276,7 +2415,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, goto out; entry = rb_entry(node, struct btrfs_free_space, offset_index); - while(1) { + while (1) { if (entry->bytes < bytes && entry->bytes > *max_extent_size) *max_extent_size = entry->bytes; @@ -2417,7 +2556,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry = NULL; struct btrfs_free_space *last; struct rb_node *node; - u64 window_start; u64 window_free; u64 max_extent; u64 total_size = 0; @@ -2439,7 +2577,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); } - window_start = entry->offset; window_free = entry->bytes; max_extent = entry->bytes; first = entry; @@ -2967,19 +3104,15 @@ out: int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, - struct btrfs_path *path) + struct btrfs_path *path, + struct inode *inode) { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct inode *inode; int ret; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return 0; - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode)) - return 0; - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); if (ret) { btrfs_delalloc_release_metadata(inode, inode->i_size); @@ -2990,7 +3123,6 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, #endif } - iput(inode); return ret; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index e737f92cf6d..0cf4977ef70 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -58,7 +58,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, - struct btrfs_path *path, struct inode *inode); int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group); @@ -76,7 +75,8 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root); int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, - struct btrfs_path *path); + struct btrfs_path *path, + struct inode *inode); void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group); int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c new file mode 100644 index 00000000000..85889aa82c6 --- /dev/null +++ b/fs/btrfs/hash.c @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <crypto/hash.h> +#include <linux/err.h> +#include "hash.h" + +static struct crypto_shash *tfm; + +int __init btrfs_hash_init(void) +{ + tfm = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + return 0; +} + +void btrfs_hash_exit(void) +{ + crypto_free_shash(tfm); +} + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(tfm)]; + } desc; + int err; + + desc.shash.tfm = tfm; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 1d982812ab6..118a2316e5d 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -19,10 +19,15 @@ #ifndef __HASH__ #define __HASH__ -#include <linux/crc32c.h> +int __init btrfs_hash_init(void); + +void btrfs_hash_exit(void); + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); + static inline u64 btrfs_name_hash(const char *name, int len) { - return crc32c((u32)~1, name, len); + return btrfs_crc32c((u32)~1, name, len); } /* @@ -31,7 +36,7 @@ static inline u64 btrfs_name_hash(const char *name, int len) static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) { - return (u64) crc32c(parent_objectid, name, len); + return (u64) btrfs_crc32c(parent_objectid, name, len); } #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index e0b7034d634..2be38df703c 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -91,32 +91,6 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, return 0; } -static struct btrfs_inode_ref * -btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow) -{ - int ret; - struct btrfs_key key; - struct btrfs_inode_ref *ref; - - key.objectid = inode_objectid; - key.type = BTRFS_INODE_REF_KEY; - key.offset = ref_objectid; - - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return NULL; - if (!find_name_in_backref(path, name, name_len, &ref)) - return NULL; - return ref; -} - /* Returns NULL if no extref found */ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, @@ -144,45 +118,6 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, return extref; } -int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod, - u64 *ret_index) -{ - struct btrfs_inode_ref *ref; - struct btrfs_inode_extref *extref; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; - - ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len, - inode_objectid, ref_objectid, ins_len, - cow); - if (IS_ERR(ref)) - return PTR_ERR(ref); - - if (ref != NULL) { - *ret_index = btrfs_inode_ref_index(path->nodes[0], ref); - return 0; - } - - btrfs_release_path(path); - - extref = btrfs_lookup_inode_extref(trans, root, path, name, - name_len, inode_objectid, - ref_objectid, ins_len, cow); - if (IS_ERR(extref)) - return PTR_ERR(extref); - - if (extref) { - *ret_index = btrfs_inode_extref_index(path->nodes[0], extref); - return 0; - } - - return -ENOENT; -} - static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -369,7 +304,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, goto out; leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); + item = btrfs_item_nr(path->slots[0]); ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); ptr += btrfs_item_size(leaf, item) - ins_len; extref = (struct btrfs_inode_extref *)ptr; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2c66ddbbe67..888fbe19079 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -55,7 +55,7 @@ static int caching_kthread(void *data) key.type = BTRFS_INODE_ITEM_KEY; again: /* need to make sure the commit_root doesn't disappear */ - mutex_lock(&root->fs_commit_mutex); + down_read(&fs_info->commit_root_sem); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -78,10 +78,8 @@ again: btrfs_transaction_in_commit(fs_info)) { leaf = path->nodes[0]; - if (btrfs_header_nritems(leaf) == 0) { - WARN_ON(1); + if (WARN_ON(btrfs_header_nritems(leaf) == 0)) break; - } /* * Save the key so we can advances forward @@ -90,7 +88,7 @@ again: btrfs_item_key_to_cpu(leaf, &key, 0); btrfs_release_path(path); root->cache_progress = last; - mutex_unlock(&root->fs_commit_mutex); + up_read(&fs_info->commit_root_sem); schedule_timeout(1); goto again; } else @@ -129,7 +127,7 @@ next: btrfs_unpin_free_ino(root); out: wake_up(&root->cache_wait); - mutex_unlock(&root->fs_commit_mutex); + up_read(&fs_info->commit_root_sem); btrfs_free_path(path); @@ -176,9 +174,13 @@ static void start_caching(struct btrfs_root *root) BTRFS_LAST_FREE_OBJECTID - objectid + 1); } - tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", + tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", root->root_key.objectid); - BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ + if (IS_ERR(tsk)) { + btrfs_warn(root->fs_info, "failed to start inode caching task"); + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); + } } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) @@ -207,42 +209,28 @@ again: void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return; - again: if (root->cached == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(ctl, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); } else { - /* - * If we are in the process of caching free ino chunks, - * to avoid adding the same inode number to the free_ino - * tree twice due to cross transaction, we'll leave it - * in the pinned tree until a transaction is committed - * or the caching work is done. - */ - - mutex_lock(&root->fs_commit_mutex); + down_write(&root->fs_info->commit_root_sem); spin_lock(&root->cache_lock); if (root->cached == BTRFS_CACHE_FINISHED) { spin_unlock(&root->cache_lock); - mutex_unlock(&root->fs_commit_mutex); + up_write(&root->fs_info->commit_root_sem); goto again; } spin_unlock(&root->cache_lock); start_caching(root); - if (objectid <= root->cache_progress || - objectid > root->highest_objectid) - __btrfs_add_free_space(ctl, objectid, 1); - else - __btrfs_add_free_space(pinned, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); - mutex_unlock(&root->fs_commit_mutex); + up_write(&root->fs_info->commit_root_sem); } } @@ -252,7 +240,7 @@ again: * and others will just be dropped, because the commit root we were * searching has changed. * - * Must be called with root->fs_commit_mutex held + * Must be called with root->fs_info->commit_root_sem held */ void btrfs_unpin_free_ino(struct btrfs_root *root) { @@ -412,8 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, return 0; /* Don't save inode cache if we are deleting this root */ - if (btrfs_root_refs(&root->root_item) == 0 && - root != root->fs_info->tree_root) + if (btrfs_root_refs(&root->root_item) == 0) return 0; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) @@ -467,7 +454,7 @@ again: } if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(root, trans, path, inode); + ret = btrfs_truncate_free_space_cache(root, trans, inode); if (ret) { if (ret != -ENOSPC) btrfs_abort_transaction(trans, root, ret); @@ -504,7 +491,7 @@ again: } btrfs_free_reserved_data_space(inode, prealloc); - ret = btrfs_write_out_ino_cache(root, trans, path); + ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: iput(inode); out_release: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 22ebc13b6c9..3668048e16f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -43,7 +43,6 @@ #include <linux/btrfs.h> #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> -#include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -59,9 +58,10 @@ #include "inode-map.h" #include "backref.h" #include "hash.h" +#include "props.h" struct btrfs_iget_args { - u64 ino; + struct btrfs_key *location; struct btrfs_root *root; }; @@ -125,14 +125,13 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * the btree. The caller should have done a btrfs_drop_extents so that * no overlapping inline items exist in the btree */ -static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, +static int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, int compress_type, struct page **compressed_pages) { - struct btrfs_key key; - struct btrfs_path *path; struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; @@ -141,29 +140,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, int err = 0; int ret; size_t cur_size = size; - size_t datasize; unsigned long offset; if (compressed_size && compressed_pages) cur_size = compressed_size; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + inode_add_bytes(inode, size); - path->leave_spinning = 1; + if (!extent_inserted) { + struct btrfs_key key; + size_t datasize; - key.objectid = btrfs_ino(inode); - key.offset = start; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - datasize = btrfs_file_extent_calc_inline_size(cur_size); + key.objectid = btrfs_ino(inode); + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - inode_add_bytes(inode, size); - ret = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (ret) { - err = ret; - goto fail; + datasize = btrfs_file_extent_calc_inline_size(cur_size); + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (ret) { + err = ret; + goto fail; + } } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -204,7 +203,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, page_cache_release(page); } btrfs_mark_buffer_dirty(leaf); - btrfs_free_path(path); + btrfs_release_path(path); /* * we're an inline extent, so nobody can @@ -220,7 +219,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, return ret; fail: - btrfs_free_path(path); return err; } @@ -243,6 +241,9 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, u64 aligned_end = ALIGN(end, root->sectorsize); u64 data_len = inline_len; int ret; + struct btrfs_path *path; + int extent_inserted = 0; + u32 extent_item_size; if (compressed_size) data_len = compressed_size; @@ -257,12 +258,27 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, return 1; } + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_free_path(path); return PTR_ERR(trans); + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1); + if (compressed_size && compressed_pages) + extent_item_size = btrfs_file_extent_calc_inline_size( + compressed_size); + else + extent_item_size = btrfs_file_extent_calc_inline_size( + inline_len); + + ret = __btrfs_drop_extents(trans, root, inode, path, + start, aligned_end, NULL, + 1, 1, extent_item_size, &extent_inserted); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; @@ -270,7 +286,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, root, inode, start, + ret = insert_inline_extent(trans, path, extent_inserted, + root, inode, start, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { @@ -285,6 +302,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: + btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; } @@ -376,6 +394,14 @@ static noinline int compress_file_range(struct inode *inode, (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); + /* + * skip compression for a small file range(<=blocksize) that + * isn't an inline extent, since it dosen't save disk space at all. + */ + if ((end - start + 1) <= blocksize && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + goto cleanup_and_bail_uncompressed; + actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; @@ -667,7 +693,7 @@ retry: ret = btrfs_reserve_extent(root, async_extent->compressed_size, async_extent->compressed_size, - 0, alloc_hint, &ins, 1); + 0, alloc_hint, &ins, 1, 1); if (ret) { int i; @@ -768,7 +794,7 @@ retry: out: return ret; out_free_reserve: - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + @@ -844,7 +870,11 @@ static noinline int cow_file_range(struct inode *inode, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 0; - BUG_ON(btrfs_is_free_space_inode(inode)); + if (btrfs_is_free_space_inode(inode)) { + WARN_ON_ONCE(1); + ret = -EINVAL; + goto out_unlock; + } num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); @@ -887,7 +917,7 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, root->sectorsize, 0, alloc_hint, - &ins, 1); + &ins, 1, 1); if (ret < 0) goto out_unlock; @@ -965,7 +995,7 @@ out: return ret; out_reserve: - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: extent_clear_unlock_delalloc(inode, start, end, locked_page, EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | @@ -1054,17 +1084,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); - async_cow->work.func = async_cow_start; - async_cow->work.ordered_func = async_cow_submit; - async_cow->work.ordered_free = async_cow_free; - async_cow->work.flags = 0; + btrfs_init_work(&async_cow->work, async_cow_start, + async_cow_submit, async_cow_free); nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); - btrfs_queue_worker(&root->fs_info->delalloc_workers, - &async_cow->work); + btrfs_queue_work(root->fs_info->delalloc_workers, + &async_cow->work); if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { wait_event(root->fs_info->async_submit_wait, @@ -1178,10 +1206,8 @@ static noinline int run_delalloc_nocow(struct inode *inode, while (1) { ret = btrfs_lookup_file_extent(trans, root, path, ino, cur_offset, 0); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + if (ret < 0) goto error; - } if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, @@ -1195,10 +1221,8 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + if (ret < 0) goto error; - } if (ret > 0) break; leaf = path->nodes[0]; @@ -1255,6 +1279,15 @@ next_slot: disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* + * if there are pending snapshots for this root, + * we fall into common COW way. + */ + if (!nolock) { + err = btrfs_start_nocow_write(root); + if (!err) + goto out_check; + } + /* * force cow if csum exists in the range. * this ensure that csum for a given extent are * either valid or do not exist. @@ -1264,7 +1297,8 @@ next_slot: nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); extent_end = ALIGN(extent_end, root->sectorsize); } else { BUG_ON(1); @@ -1272,6 +1306,8 @@ next_slot: out_check: if (extent_end <= start) { path->slots[0]++; + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto next_slot; } if (!nocow) { @@ -1290,7 +1326,8 @@ out_check: cow_start, found_key.offset - 1, page_started, nr_written, 1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto error; } cow_start = (u64)-1; @@ -1340,7 +1377,8 @@ out_check: ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto error; } } @@ -1350,6 +1388,8 @@ out_check: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_SET_PRIVATE2); + if (!nolock && nocow) + btrfs_end_nocow_write(root); cur_offset = extent_end; if (cur_offset > end) break; @@ -1364,10 +1404,8 @@ out_check: if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, end, page_started, nr_written, 1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret) goto error; - } } error: @@ -1551,7 +1589,13 @@ static void btrfs_clear_bit_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } - if (*bits & EXTENT_DO_ACCOUNTING) + /* + * We don't reserve metadata space for space cache inodes so we + * don't need to call dellalloc_release_metadata if there is an + * error. + */ + if (*bits & EXTENT_DO_ACCOUNTING && + root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID @@ -1579,7 +1623,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; @@ -1587,7 +1631,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_size; + length = bio->bi_iter.bi_size; map_length = length; ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, NULL, 0); @@ -1825,9 +1869,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) SetPageChecked(page); page_cache_get(page); - fixup->work.func = btrfs_writepage_fixup_worker; + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; - btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); + btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); return -EBUSY; } @@ -1843,14 +1887,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; + int extent_inserted = 0; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; - /* * we may be replacing one extent in the tree with another. * The new extent is pinned in the extent map, and we don't want @@ -1860,17 +1903,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, 0); + ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, + file_pos + num_bytes, NULL, 0, + 1, sizeof(*fi), &extent_inserted); if (ret) goto out; - ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; - ins.type = BTRFS_EXTENT_DATA_KEY; - ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); - if (ret) - goto out; + if (!extent_inserted) { + ins.objectid = btrfs_ino(inode); + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*fi)); + if (ret) + goto out; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -2041,10 +2090,8 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - WARN_ON(1); + if (WARN_ON(ret < 0)) return ret; - } ret = 0; while (1) { @@ -2133,7 +2180,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, old->extent_offset, fs_info, path, record_one_backref, old); - BUG_ON(ret < 0 && ret != -ENOENT); + if (ret < 0 && ret != -ENOENT) + return false; /* no backref to be processed for this extent */ if (!old->count) { @@ -2217,6 +2265,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path, return PTR_ERR(root); } + if (btrfs_root_readonly(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + /* step 2: get inode */ key.objectid = backref->inum; key.type = BTRFS_INODE_ITEM_KEY; @@ -2293,7 +2346,7 @@ again: u64 extent_len; struct btrfs_key found_key; - ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) goto out_free_path; @@ -2367,10 +2420,23 @@ out_unlock: return ret; } +static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) +{ + struct old_sa_defrag_extent *old, *tmp; + + if (!new) + return; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } + kfree(new); +} + static void relink_file_extents(struct new_sa_defrag_extent *new) { struct btrfs_path *path; - struct old_sa_defrag_extent *old, *tmp; struct sa_defrag_extent_backref *backref; struct sa_defrag_extent_backref *prev = NULL; struct inode *inode; @@ -2413,16 +2479,11 @@ static void relink_file_extents(struct new_sa_defrag_extent *new) kfree(prev); btrfs_free_path(path); - - list_for_each_entry_safe(old, tmp, &new->head, list) { - list_del(&old->list); - kfree(old); - } out: + free_sa_defrag_extent(new); + atomic_dec(&root->fs_info->defrag_running); wake_up(&root->fs_info->transaction_wait); - - kfree(new); } static struct new_sa_defrag_extent * @@ -2432,7 +2493,7 @@ record_old_file_extents(struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; struct btrfs_key key; - struct old_sa_defrag_extent *old, *tmp; + struct old_sa_defrag_extent *old; struct new_sa_defrag_extent *new; int ret; @@ -2480,7 +2541,7 @@ record_old_file_extents(struct inode *inode, if (slot >= btrfs_header_nritems(l)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out_free_list; + goto out_free_path; else if (ret > 0) break; continue; @@ -2509,7 +2570,7 @@ record_old_file_extents(struct inode *inode, old = kmalloc(sizeof(*old), GFP_NOFS); if (!old) - goto out_free_list; + goto out_free_path; offset = max(new->file_pos, key.offset); end = min(new->file_pos + new->len, key.offset + num_bytes); @@ -2531,24 +2592,28 @@ next: return new; -out_free_list: - list_for_each_entry_safe(old, tmp, &new->head, list) { - list_del(&old->list); - kfree(old); - } out_free_path: btrfs_free_path(path); out_kfree: - kfree(new); + free_sa_defrag_extent(new); return NULL; } -/* - * helper function for btrfs_finish_ordered_io, this - * just reads in some of the csum leaves to prime them into ram - * before we start the transaction. It limits the amount of btree - * reads required while inside the transaction. - */ +static void btrfs_release_delalloc_bytes(struct btrfs_root *root, + u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, start); + ASSERT(cache); + + spin_lock(&cache->lock); + cache->delalloc_bytes -= len; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); +} + /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -2610,7 +2675,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) EXTENT_DEFRAG, 1, cached_state); if (ret) { u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); - if (last_snapshot >= BTRFS_I(inode)->generation) + if (0 && last_snapshot >= BTRFS_I(inode)->generation) /* the inode is shared */ new = record_old_file_extents(inode, ordered_extent); @@ -2628,6 +2693,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans = NULL; goto out_unlock; } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) @@ -2647,6 +2713,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len, logical_len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); + if (!ret) + btrfs_release_delalloc_bytes(root, + ordered_extent->start, + ordered_extent->disk_len); } unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, ordered_extent->len, @@ -2699,7 +2769,7 @@ out: !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) btrfs_free_reserved_extent(root, ordered_extent->start, - ordered_extent->disk_len); + ordered_extent->disk_len, 1); } @@ -2710,8 +2780,14 @@ out: btrfs_remove_ordered_extent(inode, ordered_extent); /* for snapshot-aware defrag */ - if (new) - relink_file_extents(new); + if (new) { + if (ret) { + free_sa_defrag_extent(new); + atomic_dec(&root->fs_info->defrag_running); + } else { + relink_file_extents(new); + } + } /* once for us */ btrfs_put_ordered_extent(ordered_extent); @@ -2734,7 +2810,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workers *workers; + struct btrfs_workqueue *workers; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -2743,14 +2819,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, end - start + 1, uptodate)) return 0; - ordered_extent->work.func = finish_ordered_fn; - ordered_extent->work.flags = 0; + btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); if (btrfs_is_free_space_inode(inode)) - workers = &root->fs_info->endio_freespace_worker; + workers = root->fs_info->endio_freespace_worker; else - workers = &root->fs_info->endio_write_workers; - btrfs_queue_worker(workers, &ordered_extent->work); + workers = root->fs_info->endio_write_workers; + btrfs_queue_work(workers, &ordered_extent->work); return 0; } @@ -2892,14 +2967,15 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, root->orphan_block_rsv = NULL; spin_unlock(&root->orphan_lock); - if (root->orphan_item_inserted && + if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) && btrfs_root_refs(&root->root_item) > 0) { ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret) btrfs_abort_transaction(trans, root, ret); else - root->orphan_item_inserted = 0; + clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + &root->state); } if (block_rsv) { @@ -2969,6 +3045,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); if (ret) { + atomic_dec(&root->orphan_inodes); if (reserve) { clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &BTRFS_I(inode)->runtime_flags); @@ -3018,14 +3095,16 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, release_rsv = 1; spin_unlock(&root->orphan_lock); - if (trans && delete_item) - ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); - - if (release_rsv) { - btrfs_orphan_release_metadata(inode); + if (delete_item) { atomic_dec(&root->orphan_inodes); + if (trans) + ret = btrfs_del_orphan_item(trans, root, + btrfs_ino(inode)); } + if (release_rsv) + btrfs_orphan_release_metadata(inode); + return ret; } @@ -3172,8 +3251,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { - if (!S_ISREG(inode->i_mode)) { - WARN_ON(1); + if (WARN_ON(!S_ISREG(inode->i_mode))) { iput(inode); continue; } @@ -3214,7 +3292,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) btrfs_block_rsv_release(root, root->orphan_block_rsv, (u64)-1); - if (root->orphan_block_rsv || root->orphan_item_inserted) { + if (root->orphan_block_rsv || + test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans, root); @@ -3240,7 +3319,8 @@ out: * slot is the slot the inode is in, objectid is the objectid of the inode */ static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid) + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; @@ -3256,6 +3336,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, } slot++; + *first_xattr_slot = -1; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -3265,6 +3346,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* we found an xattr, assume we've got an acl */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) return 1; @@ -3293,6 +3376,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, * something larger than an xattr. We have to assume the inode * has acls */ + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; return 1; } @@ -3307,10 +3392,12 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + unsigned long ptr; int maybe_acls; u32 rdev; int ret; bool filled = false; + int first_xattr_slot; ret = btrfs_fill_inode(inode, &rdev); if (!ret) @@ -3320,7 +3407,6 @@ static void btrfs_read_locked_inode(struct inode *inode) if (!path) goto make_bad; - path->leave_spinning = 1; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); @@ -3330,7 +3416,7 @@ static void btrfs_read_locked_inode(struct inode *inode) leaf = path->nodes[0]; if (filled) - goto cache_acl; + goto cache_index; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3373,18 +3459,51 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + +cache_index: + path->slots[0]++; + if (inode->i_nlink != 1 || + path->slots[0] >= btrfs_header_nritems(leaf)) + goto cache_acl; + + btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); + if (location.objectid != btrfs_ino(inode)) + goto cache_acl; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + if (location.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + } else if (location.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, + extref); + } cache_acl: /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(inode)); + btrfs_ino(inode), &first_xattr_slot); + if (first_xattr_slot != -1) { + path->slots[0] = first_xattr_slot; + ret = btrfs_load_inode_props(inode, path); + if (ret) + btrfs_err(root->fs_info, + "error loading props for ino %llu (root %llu): %d", + btrfs_ino(inode), + root->root_key.objectid, ret); + } + btrfs_free_path(path); + if (!maybe_acls) cache_no_acl(inode); - btrfs_free_path(path); - switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; @@ -3488,7 +3607,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, goto failed; } - btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3585,6 +3703,24 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; btrfs_release_path(path); + /* + * If we don't have dir index, we have to get it by looking up + * the inode ref, since we get the inode ref, remove it directly, + * it is unnecessary to do delayed deletion. + * + * But if we have dir index, needn't search inode ref to get it. + * Since the inode ref is close to the inode item, it is better + * that we delay to delete it, and just do this deletion when + * we update the inode item. + */ + if (BTRFS_I(inode)->dir_index) { + ret = btrfs_delayed_delete_inode_ref(inode); + if (!ret) { + index = BTRFS_I(inode)->dir_index; + goto skip_backref; + } + } + ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, dir_ino, &index); if (ret) { @@ -3594,7 +3730,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); goto err; } - +skip_backref: ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -3636,7 +3772,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, int ret; ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (!ret) { - btrfs_drop_nlink(inode); + drop_nlink(inode); ret = btrfs_update_inode(trans, root, inode); } return ret; @@ -3884,7 +4020,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * not block aligned since we will be keeping the last block of the * extent just the way it is. */ - if (root->ref_cows || root == root->fs_info->tree_root) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root) btrfs_drop_extent_cache(inode, ALIGN(new_size, root->sectorsize), (u64)-1, 0); @@ -3940,7 +4077,7 @@ search_again: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_inline_len(leaf, - fi); + path->slots[0], fi); } item_end--; } @@ -3977,7 +4114,9 @@ search_again: extent_num_bytes); num_dec = (orig_num_bytes - extent_num_bytes); - if (root->ref_cows && extent_start != 0) + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state) && + extent_start != 0) inode_sub_bytes(inode, num_dec); btrfs_mark_buffer_dirty(leaf); } else { @@ -3991,7 +4130,8 @@ search_again: num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { found_extent = 1; - if (root->ref_cows) + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) inode_sub_bytes(inode, num_dec); } } @@ -4006,14 +4146,20 @@ search_again: btrfs_file_extent_other_encoding(leaf, fi) == 0) { u32 size = new_size - found_key.offset; - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); - } + + /* + * update the ram bytes to properly reflect + * the new size of our item + */ + btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); btrfs_truncate_item(root, path, size, 1); - } else if (root->ref_cows) { + } else if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) { inode_sub_bytes(inode, item_end + 1 - found_key.offset); } @@ -4035,8 +4181,9 @@ delete: } else { break; } - if (found_extent && (root->ref_cows || - root == root->fs_info->tree_root)) { + if (found_extent && + (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root)) { btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, @@ -4195,6 +4342,49 @@ out: return ret; } +static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, + u64 offset, u64 len) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * Still need to make sure the inode looks like it's been updated so + * that any holes get logged if we fsync. + */ + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) { + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = root->log_transid; + BTRFS_I(inode)->last_log_commit = root->last_log_commit; + return 0; + } + + /* + * 1 - for the one we're dropping + * 1 - for the one we're adding + * 1 - for updating the inode. + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + return ret; + } + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, len, 0, len, 0, 0, 0); + if (ret) + btrfs_abort_transaction(trans, root, ret); + else + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); + return ret; +} + /* * This function puts in dummy file extents for the area we're creating a hole * for. So if we are truncating this file to a larger size we need to insert @@ -4203,7 +4393,6 @@ out: */ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em = NULL; @@ -4230,15 +4419,16 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) while (1) { struct btrfs_ordered_extent *ordered; - btrfs_wait_ordered_range(inode, hole_start, - block_end - hole_start); + lock_extent_bits(io_tree, hole_start, block_end - 1, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, hole_start); + ordered = btrfs_lookup_ordered_range(inode, hole_start, + block_end - hole_start); if (!ordered) break; unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } @@ -4257,31 +4447,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_map *hole_em; hole_size = last_byte - cur_offset; - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - break; - } - - err = btrfs_drop_extents(trans, root, inode, - cur_offset, - cur_offset + hole_size, 1); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); - break; - } - - err = btrfs_insert_file_extent(trans, root, - btrfs_ino(inode), cur_offset, 0, - 0, hole_size, 0, hole_size, - 0, 0, 0); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); + err = maybe_insert_hole(root, inode, cur_offset, + hole_size); + if (err) break; - } - btrfs_drop_extent_cache(inode, cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); @@ -4300,7 +4469,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->ram_bytes = hole_size; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; - hole_em->generation = trans->transid; + hole_em->generation = root->fs_info->generation; while (1) { write_lock(&em_tree->lock); @@ -4313,17 +4482,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_size - 1, 0); } free_extent_map(hole_em); -next: - btrfs_update_inode(trans, root, inode); - btrfs_end_transaction(trans, root); } +next: free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - free_extent_map(em); unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); @@ -4345,8 +4511,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) - inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize != oldsize) { + inode_inc_iversion(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) + inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); + } if (newsize > oldsize) { truncate_pagecache(inode, newsize); @@ -4455,12 +4625,70 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = btrfs_acl_chmod(inode); + err = posix_acl_chmod(inode, inode->i_mode); } return err; } +/* + * While truncating the inode pages during eviction, we get the VFS calling + * btrfs_invalidatepage() against each page of the inode. This is slow because + * the calls to btrfs_invalidatepage() result in a huge amount of calls to + * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting + * extent_state structures over and over, wasting lots of time. + * + * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all + * those expensive operations on a per page basis and do only the ordered io + * finishing, while we release here the extent_map and extent_state structures, + * without the excessive merging and splitting. + */ +static void evict_inode_truncate_pages(struct inode *inode) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; + struct rb_node *node; + + ASSERT(inode->i_state & I_FREEING); + truncate_inode_pages_final(&inode->i_data); + + write_lock(&map_tree->lock); + while (!RB_EMPTY_ROOT(&map_tree->map)) { + struct extent_map *em; + + node = rb_first(&map_tree->map); + em = rb_entry(node, struct extent_map, rb_node); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + remove_extent_mapping(map_tree, em); + free_extent_map(em); + } + write_unlock(&map_tree->lock); + + spin_lock(&io_tree->lock); + while (!RB_EMPTY_ROOT(&io_tree->state)) { + struct extent_state *state; + struct extent_state *cached_state = NULL; + + node = rb_first(&io_tree->state); + state = rb_entry(node, struct extent_state, rb_node); + atomic_inc(&state->refs); + spin_unlock(&io_tree->lock); + + lock_extent_bits(io_tree, state->start, state->end, + 0, &cached_state); + clear_extent_bit(io_tree, state->start, state->end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + free_extent_state(state); + + spin_lock(&io_tree->lock); + } + spin_unlock(&io_tree->lock); +} + void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; @@ -4471,9 +4699,12 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); - truncate_inode_pages(&inode->i_data, 0); - if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || - btrfs_is_free_space_inode(inode))) + evict_inode_truncate_pages(inode); + + if (inode->i_nlink && + ((btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_is_free_space_inode(inode))) goto no_delete; if (is_bad_inode(inode)) { @@ -4490,7 +4721,8 @@ void btrfs_evict_inode(struct inode *inode) } if (inode->i_nlink > 0) { - BUG_ON(btrfs_root_refs(&root->root_item) != 0); + BUG_ON(btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); goto no_delete; } @@ -4643,9 +4875,9 @@ static int fixup_tree_root_location(struct btrfs_root *root, } err = -ENOENT; - ret = btrfs_find_root_ref(root->fs_info->tree_root, path, - BTRFS_I(dir)->root->root_key.objectid, - location->objectid); + ret = btrfs_find_item(root->fs_info->tree_root, path, + BTRFS_I(dir)->root->root_key.objectid, + location->objectid, BTRFS_ROOT_REF_KEY, NULL); if (ret) { if (ret < 0) err = ret; @@ -4731,14 +4963,7 @@ static void inode_tree_del(struct inode *inode) } spin_unlock(&root->inode_lock); - /* - * Free space cache has inodes in the tree root, but the tree root has a - * root_refs of 0, so this could end up dropping the tree root as a - * snapshot, so we need the extra !root->fs_info->tree_root check to - * make sure we don't drop it. - */ - if (empty && btrfs_root_refs(&root->root_item) == 0 && - root != root->fs_info->tree_root) { + if (empty && btrfs_root_refs(&root->root_item) == 0) { synchronize_srcu(&root->fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); @@ -4756,7 +4981,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root) struct inode *inode; u64 objectid = 0; - WARN_ON(btrfs_root_refs(&root->root_item) != 0); + if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); again: @@ -4813,7 +5039,9 @@ again: static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; + inode->i_ino = args->location->objectid; + memcpy(&BTRFS_I(inode)->location, args->location, + sizeof(*args->location)); BTRFS_I(inode)->root = args->root; return 0; } @@ -4821,20 +5049,22 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->ino == btrfs_ino(inode) && + return args->location->objectid == BTRFS_I(inode)->location.objectid && args->root == BTRFS_I(inode)->root; } static struct inode *btrfs_iget_locked(struct super_block *s, - u64 objectid, + struct btrfs_key *location, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; - args.ino = objectid; + unsigned long hashval = btrfs_inode_hash(location->objectid, root); + + args.location = location; args.root = root; - inode = iget5_locked(s, objectid, btrfs_find_actor, + inode = iget5_locked(s, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; @@ -4848,13 +5078,11 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, { struct inode *inode; - inode = btrfs_iget_locked(s, location->objectid, root); + inode = btrfs_iget_locked(s, location, root); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); if (!is_bad_inode(inode)) { inode_tree_add(inode); @@ -4910,7 +5138,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.objectid == 0) - return NULL; + return ERR_PTR(-ENOENT); if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, &location, root, NULL); @@ -4967,17 +5195,23 @@ static int btrfs_dentry_delete(const struct dentry *dentry) static void btrfs_dentry_release(struct dentry *dentry) { - if (dentry->d_fsdata) - kfree(dentry->d_fsdata); + kfree(dentry->d_fsdata); } static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *ret; + struct inode *inode; - ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); - return ret; + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOENT) + inode = NULL; + else + return ERR_CAST(inode); + } + + return d_materialise_unique(dentry, inode); } unsigned char btrfs_filetype_table[] = { @@ -5048,7 +5282,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) continue; } - item = btrfs_item_nr(leaf, slot); + item = btrfs_item_nr(slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) @@ -5345,9 +5579,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; + int nitems = name ? 2 : 1; unsigned long ptr; int ret; - int owner; path = btrfs_alloc_path(); if (!path) @@ -5365,7 +5599,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ inode->i_ino = objectid; - if (dir) { + if (dir && name) { trace_btrfs_inode_request(dir); ret = btrfs_set_inode_index(dir, index); @@ -5374,6 +5608,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, iput(inode); return ERR_PTR(ret); } + } else if (dir) { + *index = 0; } /* * index_cnt is ignored for everything but a dir, @@ -5381,6 +5617,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * number */ BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->dir_index = *index; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -5393,30 +5630,28 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - if (S_ISDIR(mode)) - owner = 0; - else - owner = 1; - key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; - /* - * Start new inodes with an inode_ref. This is slightly more - * efficient for small numbers of hard links since they will - * be packed into one item. Extended refs will kick in if we - * add more hard links than can fit in the ref item. - */ - key[1].objectid = objectid; - btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); - key[1].offset = ref_objectid; - sizes[0] = sizeof(struct btrfs_inode_item); - sizes[1] = name_len + sizeof(*ref); + + if (name) { + /* + * Start new inodes with an inode_ref. This is slightly more + * efficient for small numbers of hard links since they will + * be packed into one item. Extended refs will kick in if we + * add more hard links than can fit in the ref item. + */ + key[1].objectid = objectid; + btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[1] = name_len + sizeof(*ref); + } path->leave_spinning = 1; - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) goto fail; @@ -5429,12 +5664,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); - ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, - struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, *index); - ptr = (unsigned long)(ref + 1); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + if (name) { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + } btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); @@ -5454,7 +5691,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_INODE_NODATASUM; } - insert_inode_hash(inode); + btrfs_insert_inode_hash(inode); inode_tree_add(inode); trace_btrfs_inode_new(inode); @@ -5462,9 +5699,15 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_update_root_times(trans, root); + ret = btrfs_inode_inherit_props(trans, inode, dir); + if (ret) + btrfs_err(root->fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(inode), root->root_key.objectid, ret); + return inode; fail: - if (dir) + if (dir && name) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); iput(inode); @@ -5621,6 +5864,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } out_unlock: btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); if (drop_inode) { inode_dec_link_count(inode); @@ -5694,6 +5938,7 @@ out_unlock: inode_dec_link_count(inode); iput(inode); } + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; } @@ -5730,7 +5975,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } - btrfs_inc_nlink(inode); + /* There are several dir indexes for this inode, clear the cache. */ + BTRFS_I(inode)->dir_index = 0ULL; + inc_nlink(inode); inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ihold(inode); @@ -5745,11 +5992,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, err = btrfs_update_inode(trans, root, inode); if (err) goto fail; + if (inode->i_nlink == 1) { + /* + * If new hard link count is 1, it's a file created + * with open(2) O_TMPFILE flag. + */ + err = btrfs_orphan_del(trans, inode); + if (err) + goto fail; + } d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); fail: if (drop_inode) { inode_dec_link_count(inode); @@ -5816,6 +6073,7 @@ out_fail: btrfs_end_transaction(trans, root); if (drop_on_err) iput(inode); + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; } @@ -5860,7 +6118,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, - btrfs_item_nr(leaf, path->slots[0])); + btrfs_item_nr(path->slots[0])); tmp = kmalloc(inline_size, GFP_NOFS); if (!tmp) return -ENOMEM; @@ -5871,16 +6129,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); - if (ret) { - char *kaddr = kmap_atomic(page); - unsigned long copy_size = min_t(u64, - PAGE_CACHE_SIZE - pg_offset, - max_size - extent_offset); - memset(kaddr + pg_offset, 0, copy_size); - kunmap_atomic(kaddr); - } kfree(tmp); - return 0; + return ret; } /* @@ -5898,7 +6148,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, { int ret; int err = 0; - u64 bytenr; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); @@ -5912,7 +6161,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; - int compress_type; + const bool new_inline = !page || create; again: read_lock(&em_tree->lock); @@ -5974,22 +6223,28 @@ again: found_type = btrfs_key_type(&found_key); if (found_key.objectid != objectid || found_type != BTRFS_EXTENT_DATA_KEY) { - goto not_found; + /* + * If we backup past the first extent we want to move forward + * and see if there is an extent in front of us, otherwise we'll + * say there is a hole for our whole search range which can + * cause problems. + */ + extent_end = start; + goto next; } found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - compress_type = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; - size = btrfs_file_extent_inline_len(leaf, item); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_end = ALIGN(extent_start + size, root->sectorsize); } - +next: if (start >= extent_end) { path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -6014,32 +6269,10 @@ again: goto not_found_em; } - em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); + btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); + if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - em->start = extent_start; - em->len = extent_end - extent_start; - em->orig_start = extent_start - - btrfs_file_extent_offset(leaf, item); - em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, - item); - bytenr = btrfs_file_extent_disk_bytenr(leaf, item); - if (bytenr == 0) { - em->block_start = EXTENT_MAP_HOLE; - goto insert; - } - if (compress_type != BTRFS_COMPRESS_NONE) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - em->block_start = bytenr; - em->block_len = em->orig_block_len; - } else { - bytenr += btrfs_file_extent_offset(leaf, item); - em->block_start = bytenr; - em->block_len = em->len; - if (found_type == BTRFS_FILE_EXTENT_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); - } goto insert; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { unsigned long ptr; @@ -6048,14 +6281,10 @@ again: size_t extent_offset; size_t copy_size; - em->block_start = EXTENT_MAP_INLINE; - if (!page || create) { - em->start = extent_start; - em->len = extent_end - extent_start; + if (new_inline) goto out; - } - size = btrfs_file_extent_inline_len(leaf, item); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, size - extent_offset); @@ -6063,10 +6292,6 @@ again: em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; - if (compress_type) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != @@ -6074,7 +6299,10 @@ again: ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + err = ret; + goto out; + } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -6110,8 +6338,6 @@ again: set_extent_uptodate(io_tree, em->start, extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; - } else { - WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type); } not_found: em->start = start; @@ -6173,8 +6399,7 @@ insert: write_unlock(&em_tree->lock); out: - if (em) - trace_btrfs_get_extent(root, em); + trace_btrfs_get_extent(root, em); if (path) btrfs_free_path(path); @@ -6249,7 +6474,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag /* adjust the range_start to make sure it doesn't * go backwards from the start they passed in */ - range_start = max(start,range_start); + range_start = max(start, range_start); found = found_end - range_start; if (found > 0) { @@ -6329,21 +6554,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, alloc_hint = get_extent_allocation_hint(inode, start, len); ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, - alloc_hint, &ins, 1); + alloc_hint, &ins, 1, 1); if (ret) return ERR_PTR(ret); em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, 0); if (IS_ERR(em)) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); return em; } ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); free_extent_map(em); return ERR_PTR(ret); } @@ -6364,6 +6589,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, int ret; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 disk_bytenr; @@ -6373,6 +6599,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, int slot; int found_type; bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6416,6 +6643,10 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) goto out; + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= offset) + goto out; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); if (disk_bytenr == 0) goto out; @@ -6433,11 +6664,24 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); } - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (btrfs_extent_readonly(root, disk_bytenr)) goto out; + num_bytes = min(offset + *len, extent_end) - offset; + if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 range_end; + + range_end = round_up(offset + num_bytes, root->sectorsize) - 1; + ret = test_range_bit(io_tree, offset, range_end, + EXTENT_DELALLOC, 0, NULL); + if (ret) { + ret = -EAGAIN; + goto out; + } + } + + btrfs_release_path(path); + /* * look for other files referencing this extent, if we * find any we must cow @@ -6464,7 +6708,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, */ disk_bytenr += backref_offset; disk_bytenr += offset - key.offset; - num_bytes = min(offset + *len, extent_end) - offset; if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out; /* @@ -6478,6 +6721,76 @@ out: return ret; } +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) +{ + struct radix_tree_root *root = &inode->i_mapping->page_tree; + int found = false; + void **pagep = NULL; + struct page *page = NULL; + int start_idx; + int end_idx; + + start_idx = start >> PAGE_CACHE_SHIFT; + + /* + * end is the last byte in the last page. end == start is legal + */ + end_idx = end >> PAGE_CACHE_SHIFT; + + rcu_read_lock(); + + /* Most of the code in this while loop is lifted from + * find_get_page. It's been modified to begin searching from a + * page and return just the first page found in that range. If the + * found idx is less than or equal to the end idx then we know that + * a page exists. If no pages are found or if those pages are + * outside of the range then we're fine (yay!) */ + while (page == NULL && + radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page)) + break; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + page = NULL; + continue; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so return it without + * attempting to raise page count. + */ + page = NULL; + break; /* TODO: Is this relevant for this use case? */ + } + + if (!page_cache_get_speculative(page)) { + page = NULL; + continue; + } + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + page = NULL; + } + } + + if (page) { + if (page->index <= end_idx) + found = true; + page_cache_release(page); + } + + rcu_read_unlock(); + return found; +} + static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct extent_state **cached_state, int writing) { @@ -6502,10 +6815,9 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * invalidate needs to happen so that reads after a write do not * get stale data. */ - if (!ordered && (!writing || - !test_range_bit(&BTRFS_I(inode)->io_tree, - lockstart, lockend, EXTENT_UPTODATE, 0, - *cached_state))) + if (!ordered && + (!writing || + !btrfs_page_exists_in_range(inode, lockstart, lockend))) break; unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, @@ -6765,17 +7077,16 @@ unlock_err: static void btrfs_endio_direct_read(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *dio_bio; u32 *csums = (u32 *)dip->csum; - int index = 0; u64 start; + int i; start = dip->logical_offset; - do { + bio_for_each_segment_all(bvec, bio, i) { if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { struct page *page = bvec->bv_page; char *kaddr; @@ -6791,18 +7102,16 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) local_irq_restore(flags); flush_dcache_page(bvec->bv_page); - if (csum != csums[index]) { + if (csum != csums[i]) { btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, - csums[index]); + csums[i]); err = -EIO; } } start += bvec->bv_len; - bvec++; - index++; - } while (bvec <= bvec_end); + } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -6837,10 +7146,9 @@ again: if (!ret) goto out_test; - ordered->work.func = finish_ordered_fn; - ordered->work.flags = 0; - btrfs_queue_worker(&root->fs_info->endio_write_workers, - &ordered->work); + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(root->fs_info->endio_write_workers, + &ordered->work); out_test: /* * our bio might span multiple ordered extents. If we haven't @@ -6880,17 +7188,18 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) struct btrfs_dio_private *dip = bio->bi_private; if (err) { - printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " - "sector %#Lx len %u err no %d\n", + btrfs_err(BTRFS_I(dip->inode)->root->fs_info, + "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", btrfs_ino(dip->inode), bio->bi_rw, - (unsigned long long)bio->bi_sector, bio->bi_size, err); + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, err); dip->errors = 1; /* * before atomic variable goto zero, we must make sure * dip->errors is perceived to be set. */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); } /* if there are more bios still pending for this dio, just exit */ @@ -6974,7 +7283,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, struct bio *bio; struct bio *orig_bio = dip->orig_bio; struct bio_vec *bvec = orig_bio->bi_io_vec; - u64 start_sector = orig_bio->bi_sector; + u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; u64 submit_len = 0; u64 map_length; @@ -6982,7 +7291,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int ret = 0; int async_submit = 0; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -6990,7 +7299,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, return -EIO; } - if (map_length >= orig_bio->bi_size) { + if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; goto submit; } @@ -7042,7 +7351,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); @@ -7052,7 +7361,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, } } else { submit_len += bvec->bv_len; - nr_pages ++; + nr_pages++; bvec++; } } @@ -7070,7 +7379,7 @@ out_err: * before atomic variable goto zero, we must * make sure dip->errors is perceived to be set. */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); if (atomic_dec_and_test(&dip->pending_bios)) bio_io_error(dip->orig_bio); @@ -7100,7 +7409,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (!skip_sum && !write) { csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits; + sum_len = dio_bio->bi_iter.bi_size >> + inode->i_sb->s_blocksize_bits; sum_len *= csum_size; } else { sum_len = 0; @@ -7115,8 +7425,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; - dip->bytes = dio_bio->bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; io_bio->bi_private = dip; dip->errors = 0; dip->orig_bio = io_bio; @@ -7146,7 +7456,7 @@ free_ordered: if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) btrfs_free_reserved_extent(root, ordered->start, - ordered->disk_len); + ordered->disk_len, 1); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } @@ -7154,39 +7464,30 @@ free_ordered: } static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + const struct iov_iter *iter, loff_t offset) { int seg; int i; - size_t size; - unsigned long addr; unsigned blocksize_mask = root->sectorsize - 1; ssize_t retval = -EINVAL; - loff_t end = offset; if (offset & blocksize_mask) goto out; - /* Check the memory alignment. Blocks cannot straddle pages */ - for (seg = 0; seg < nr_segs; seg++) { - addr = (unsigned long)iov[seg].iov_base; - size = iov[seg].iov_len; - end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) - goto out; - - /* If this is a write we don't need to check anymore */ - if (rw & WRITE) - continue; + if (iov_iter_alignment(iter) & blocksize_mask) + goto out; - /* - * Check to make sure we don't have duplicate iov_base's in this - * iovec, if so return EINVAL, otherwise we'll get csum errors - * when reading back. - */ - for (i = seg + 1; i < nr_segs; i++) { - if (iov[seg].iov_base == iov[i].iov_base) + /* If this is a write we don't need to check anymore */ + if (rw & WRITE) + return 0; + /* + * Check to make sure we don't have duplicate iov_base's in this + * iovec, if so return EINVAL, otherwise we'll get csum errors + * when reading back. + */ + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + if (iter->iov[seg].iov_base == iter->iov[i].iov_base) goto out; } } @@ -7196,8 +7497,7 @@ out: } static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -7207,21 +7507,22 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, bool relock = false; ssize_t ret; - if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, - offset, nr_segs)) + if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset)) return 0; atomic_inc(&inode->i_dio_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* - * The generic stuff only does filemap_write_and_wait_range, which isn't - * enough if we've written compressed pages to this area, so we need to - * call btrfs_wait_ordered_range to make absolutely sure that any - * outstanding dirty pages are on disk. + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so + * we need to flush the dirty pages again to make absolutely sure + * that any outstanding dirty pages are on disk. */ - count = iov_length(iov, nr_segs); - btrfs_wait_ordered_range(inode, offset, count); + count = iov_iter_count(iter); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, offset, count); if (rw & WRITE) { /* @@ -7245,7 +7546,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + iter, offset, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (rw & WRITE) { if (ret < 0 && ret != -EIOCBQUEUED) @@ -7351,6 +7652,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + int inode_evicting = inode->i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -7366,17 +7668,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); + + if (!inode_evicting) + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { /* * IO on this page will never be started, so we need * to account for any ordered extents now */ - clear_extent_bit(tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); + if (!inode_evicting) + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 0, &cached_state, + GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -7400,14 +7706,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); - cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + if (!inode_evicting) { + cached_state = NULL; + lock_extent_bits(tree, page_start, page_end, 0, + &cached_state); + } + } + + if (!inode_evicting) { + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + + __btrfs_releasepage(page, GFP_NOFS); } - clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, - &cached_state, GFP_NOFS); - __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); if (PagePrivate(page)) { @@ -7562,7 +7876,10 @@ static int btrfs_truncate(struct inode *inode) u64 mask = root->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), + (u64)-1); + if (ret) + return ret; /* * Yes ladies and gentelment, this is indeed ugly. The fact is we have @@ -7714,7 +8031,9 @@ out: * create a new subvolume directory/inode (helper for the ioctl). */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, u64 new_dirid) + struct btrfs_root *new_root, + struct btrfs_root *parent_root, + u64 new_dirid) { struct inode *inode; int err; @@ -7732,6 +8051,12 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, set_nlink(inode, 1); btrfs_i_size_write(inode, 0); + err = btrfs_subvol_inherit_props(trans, new_root, parent_root); + if (err) + btrfs_err(new_root->fs_info, + "error inheriting subvolume %llu properties: %d", + new_root->root_key.objectid, err); + err = btrfs_update_inode(trans, new_root, inode); iput(inode); @@ -7757,6 +8082,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->flags = 0; ei->csum_bytes = 0; ei->index_cnt = (u64)-1; + ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; @@ -7786,6 +8112,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) return inode; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_destroy_inode(struct inode *inode) +{ + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} +#endif + static void btrfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -7856,8 +8190,7 @@ int btrfs_drop_inode(struct inode *inode) return 1; /* the snap/subvol tree is on deleting */ - if (btrfs_root_refs(&root->root_item) == 0 && - root != root->fs_info->tree_root) + if (btrfs_root_refs(&root->root_item) == 0) return 1; else return generic_drop_inode(inode); @@ -7986,7 +8319,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* check for collisions, even if the name isn't there */ - ret = btrfs_check_dir_item_collision(root, new_dir->i_ino, + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, new_dentry->d_name.name, new_dentry->d_name.len); @@ -7994,8 +8327,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret == -EEXIST) { /* we shouldn't get * eexist without a new_inode */ - if (!new_inode) { - WARN_ON(1); + if (WARN_ON(!new_inode)) { return ret; } } else { @@ -8038,9 +8370,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; + BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); } else { ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, @@ -8126,6 +8459,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_fail; } + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = index; + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { struct dentry *parent = new_dentry->d_parent; btrfs_log_new_name(trans, old_inode, old_dir, parent); @@ -8143,18 +8479,24 @@ out_notrans: static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; + struct inode *inode; delalloc_work = container_of(work, struct btrfs_delalloc_work, work); - if (delalloc_work->wait) - btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1); - else - filemap_flush(delalloc_work->inode->i_mapping); + inode = delalloc_work->inode; + if (delalloc_work->wait) { + btrfs_wait_ordered_range(inode, 0, (u64)-1); + } else { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } if (delalloc_work->delay_iput) - btrfs_add_delayed_iput(delalloc_work->inode); + btrfs_add_delayed_iput(inode); else - iput(delalloc_work->inode); + iput(inode); complete(&delalloc_work->completion); } @@ -8172,7 +8514,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, work->inode = inode; work->wait = wait; work->delay_iput = delay_iput; - work->work.func = btrfs_run_delalloc_work; + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); return work; } @@ -8187,7 +8529,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, + int nr) { struct btrfs_inode *binode; struct inode *inode; @@ -8199,6 +8542,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); + mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { @@ -8224,19 +8568,16 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) goto out; } list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); - + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); + ret++; + if (nr != -1 && ret >= nr) + goto out; cond_resched(); spin_lock(&root->delalloc_lock); } spin_unlock(&root->delalloc_lock); - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); - } - return 0; out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); @@ -8248,6 +8589,7 @@ out: list_splice_tail(&splice, &root->delalloc_inodes); spin_unlock(&root->delalloc_lock); } + mutex_unlock(&root->delalloc_mutex); return ret; } @@ -8255,10 +8597,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { int ret; - if (root->fs_info->sb->s_flags & MS_RDONLY) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EROFS; - ret = __start_delalloc_inodes(root, delay_iput); + ret = __start_delalloc_inodes(root, delay_iput, -1); + if (ret > 0) + ret = 0; /* * the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that @@ -8275,21 +8619,22 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return ret; } -int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, - int delay_iput) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr) { struct btrfs_root *root; struct list_head splice; int ret; - if (fs_info->sb->s_flags & MS_RDONLY) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; INIT_LIST_HEAD(&splice); + mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); - while (!list_empty(&splice)) { + while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_fs_root(root); @@ -8298,15 +8643,20 @@ int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = __start_delalloc_inodes(root, delay_iput); + ret = __start_delalloc_inodes(root, delay_iput, nr); btrfs_put_fs_root(root); - if (ret) + if (ret < 0) goto out; + if (nr != -1) { + nr -= ret; + WARN_ON(nr < 0); + } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); + ret = 0; atomic_inc(&fs_info->async_submit_draining); while (atomic_read(&fs_info->nr_async_submits) || atomic_read(&fs_info->async_delalloc_pages)) { @@ -8315,13 +8665,13 @@ int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, atomic_read(&fs_info->async_delalloc_pages) == 0)); } atomic_dec(&fs_info->async_submit_draining); - return 0; out: if (!list_empty_careful(&splice)) { spin_lock(&fs_info->delalloc_root_lock); list_splice_tail(&splice, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } + mutex_unlock(&fs_info->delalloc_root_mutex); return ret; } @@ -8336,14 +8686,14 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, int err; int drop_inode = 0; u64 objectid; - u64 index = 0 ; + u64 index = 0; int name_len; int datasize; unsigned long ptr; struct btrfs_file_extent_item *ei; struct extent_buffer *leaf; - name_len = strlen(symname) + 1; + name_len = strlen(symname); if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; @@ -8431,7 +8781,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &btrfs_symlink_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; inode_set_bytes(inode, name_len); - btrfs_i_size_write(inode, name_len - 1); + btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); if (err) drop_inode = 1; @@ -8477,7 +8827,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); cur_bytes = max(cur_bytes, min_size); ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, - *alloc_hint, &ins, 1); + *alloc_hint, &ins, 1, 0); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); @@ -8490,6 +8840,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, ins.offset, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, + ins.offset, 0); btrfs_abort_transaction(trans, root, ret); if (own_trans) btrfs_end_transaction(trans, root); @@ -8599,6 +8951,66 @@ static int btrfs_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } +static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + u64 objectid; + u64 index; + int ret = 0; + + /* + * 5 units required for adding orphan entry + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + goto out; + + inode = btrfs_new_inode(trans, root, dir, NULL, 0, + btrfs_ino(dir), objectid, mode, &index); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; + goto out; + } + + ret = btrfs_init_inode_security(trans, inode, dir, NULL); + if (ret) + goto out; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto out; + + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; + + d_tmpfile(dentry, inode); + mark_inode_dirty(inode); + +out: + btrfs_end_transaction(trans, root); + if (ret) + iput(inode); + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); + + return ret; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -8617,12 +9029,15 @@ static const struct inode_operations btrfs_dir_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, + .tmpfile = btrfs_tmpfile, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; @@ -8692,6 +9107,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_special_inode_operations = { @@ -8703,6 +9119,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { @@ -8716,7 +9133,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .get_acl = btrfs_get_acl, .update_time = btrfs_update_time, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9d46f60cb94..47aceb494d1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -44,7 +44,6 @@ #include <linux/uuid.h> #include <linux/btrfs.h> #include <linux/uaccess.h> -#include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -57,6 +56,35 @@ #include "rcu-string.h" #include "send.h" #include "dev-replace.h" +#include "props.h" +#include "sysfs.h" +#include "qgroup.h" + +#ifdef CONFIG_64BIT +/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI + * structures are incorrect, as the timespec structure from userspace + * is 4 bytes too small. We define these alternatives here to teach + * the kernel about the 32-bit struct packing. + */ +struct btrfs_ioctl_timespec_32 { + __u64 sec; + __u32 nsec; +} __attribute__ ((__packed__)); + +struct btrfs_ioctl_received_subvol_args_32 { + char uuid[BTRFS_UUID_SIZE]; /* in */ + __u64 stransid; /* in */ + __u64 rtransid; /* out */ + struct btrfs_ioctl_timespec_32 stime; /* in */ + struct btrfs_ioctl_timespec_32 rtime; /* out */ + __u64 flags; /* in */ + __u64 reserved[16]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \ + struct btrfs_ioctl_received_subvol_args_32) +#endif + static int btrfs_clone(struct inode *src, struct inode *inode, u64 off, u64 olen, u64 olen_aligned, u64 destoff); @@ -108,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) void btrfs_update_iflags(struct inode *inode) { struct btrfs_inode *ip = BTRFS_I(inode); - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + unsigned int new_fl = 0; if (ip->flags & BTRFS_INODE_SYNC) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (ip->flags & BTRFS_INODE_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (ip->flags & BTRFS_INODE_APPEND) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (ip->flags & BTRFS_INODE_NOATIME) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (ip->flags & BTRFS_INODE_DIRSYNC) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + + set_mask_bits(&inode->i_flags, + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, + new_fl); } /* @@ -191,6 +222,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int i_oldflags; umode_t mode; + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (btrfs_root_readonly(root)) return -EROFS; @@ -201,9 +235,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) return ret; - if (!inode_owner_or_capable(inode)) - return -EACCES; - ret = mnt_want_write_file(file); if (ret) return ret; @@ -281,9 +312,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (flags & FS_NOCOMP_FL) { ip->flags &= ~BTRFS_INODE_COMPRESS; ip->flags |= BTRFS_INODE_NOCOMPRESS; + + ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); + if (ret && ret != -ENODATA) + goto out_drop; } else if (flags & FS_COMPR_FL) { + const char *comp; + ip->flags |= BTRFS_INODE_COMPRESS; ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + + if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO) + comp = "lzo"; + else + comp = "zlib"; + ret = btrfs_set_prop(inode, "btrfs.compression", + comp, strlen(comp), 0); + if (ret) + goto out_drop; + } else { ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } @@ -321,7 +368,7 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); struct btrfs_device *device; struct request_queue *q; struct fstrim_range range; @@ -369,9 +416,13 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) int btrfs_is_empty_uuid(u8 *uuid) { - static char empty_uuid[BTRFS_UUID_SIZE] = {0}; + int i; - return !memcmp(uuid, empty_uuid, BTRFS_UUID_SIZE); + for (i = 0; i < BTRFS_UUID_SIZE; i++) { + if (uuid[i]) + return 0; + } + return 1; } static noinline int create_subvol(struct inode *dir, @@ -389,6 +440,7 @@ static noinline int create_subvol(struct inode *dir, struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; struct timespec cur_time = CURRENT_TIME; + struct inode *inode; int ret; int err; u64 objectid; @@ -414,7 +466,9 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out; + btrfs_subvolume_release_metadata(root, &block_rsv, + qgroup_reserved); + return ret; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -436,7 +490,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); - write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(leaf), + write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(leaf), @@ -497,7 +551,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, new_dirid); + ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); if (ret) { /* We potentially lose an unused inode item here */ btrfs_abort_transaction(trans, root, ret); @@ -539,6 +593,8 @@ static noinline int create_subvol(struct inode *dir, fail: trans->block_rsv = NULL; trans->bytes_reserved = 0; + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); + if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); @@ -550,13 +606,32 @@ fail: if (err && !ret) ret = err; - if (!ret) - d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); -out: - btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); + if (!ret) { + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, inode); + } return ret; } +static void btrfs_wait_nocow_write(struct btrfs_root *root) +{ + s64 writers; + DEFINE_WAIT(wait); + + do { + prepare_to_wait(&root->subv_writers->wait, &wait, + TASK_UNINTERRUPTIBLE); + + writers = percpu_counter_sum(&root->subv_writers->counter); + if (writers) + schedule(); + + finish_wait(&root->subv_writers->wait, &wait); + } while (writers); +} + static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, char *name, int namelen, u64 *async_transid, bool readonly, @@ -567,18 +642,24 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_trans_handle *trans; int ret; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; + atomic_inc(&root->will_be_snapshoted); + smp_mb__after_atomic(); + btrfs_wait_nocow_write(root); + ret = btrfs_start_delalloc_inodes(root, 0); if (ret) - return ret; + goto out; - btrfs_wait_ordered_extents(root); + btrfs_wait_ordered_extents(root, -1); pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) - return -ENOMEM; + if (!pending_snapshot) { + ret = -ENOMEM; + goto out; + } btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -595,7 +676,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, &pending_snapshot->qgroup_reserved, false); if (ret) - goto out; + goto free; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -634,20 +715,51 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; + /* + * If orphan cleanup did remove any orphans, it means the tree was + * modified and therefore the commit root is not the same as the + * current root anymore. This is a problem, because send uses the + * commit root and therefore can see inode items that don't exist + * in the current root anymore, and for example make calls to + * btrfs_iget, which will do tree lookups based on the current root + * and not on the commit root. Those lookups will fail, returning a + * -ESTALE error, and making send fail with that error. So make sure + * a send does not see any orphans we have just removed, and that it + * will see the same inodes regardless of whether a transaction + * commit happened before it started (meaning that the commit root + * will be the same as the current root) or not. + */ + if (readonly && pending_snapshot->snap->node != + pending_snapshot->snap->commit_root) { + trans = btrfs_join_transaction(pending_snapshot->snap); + if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto fail; + } + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, + pending_snapshot->snap); + if (ret) + goto fail; + } + } + inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; } - BUG_ON(!inode); + d_instantiate(dentry, inode); ret = 0; fail: btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, pending_snapshot->qgroup_reserved); -out: +free: kfree(pending_snapshot); +out: + atomic_dec(&root->will_be_snapshoted); return ret; } @@ -688,7 +800,7 @@ static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) * nfs_async_unlink(). */ -static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir) +static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) { int error; @@ -842,7 +954,6 @@ static int find_new_extents(struct btrfs_root *root, { struct btrfs_path *path; struct btrfs_key min_key; - struct btrfs_key max_key; struct extent_buffer *leaf; struct btrfs_file_extent_item *extent; int type; @@ -857,17 +968,14 @@ static int find_new_extents(struct btrfs_root *root, min_key.type = BTRFS_EXTENT_DATA_KEY; min_key.offset = *off; - max_key.objectid = ino; - max_key.type = (u8)-1; - max_key.offset = (u64)-1; - - path->keep_locks = 1; - - while(1) { - ret = btrfs_search_forward(root, &min_key, &max_key, - path, newer_than); + while (1) { + path->keep_locks = 1; + ret = btrfs_search_forward(root, &min_key, path, newer_than); if (ret != 0) goto none; + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); +process_slot: if (min_key.objectid != ino) goto none; if (min_key.type != BTRFS_EXTENT_DATA_KEY) @@ -886,6 +994,12 @@ static int find_new_extents(struct btrfs_root *root, return 0; } + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); + goto process_slot; + } + if (min_key.offset == (u64)-1) goto none; @@ -913,10 +1027,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) read_unlock(&em_tree->lock); if (!em) { + struct extent_state *cached = NULL; + u64 end = start + len - 1; + /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1); + lock_extent_bits(io_tree, start, end, 0, &cached); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1); + unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); if (IS_ERR(em)) return NULL; @@ -935,7 +1052,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) return false; next = defrag_lookup_extent(inode, em->start + em->len); - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || + (em->block_start + em->block_len == next->block_start)) ret = false; free_extent_map(next); @@ -1014,7 +1132,7 @@ out: static int cluster_pages_for_defrag(struct inode *inode, struct page **pages, unsigned long start_index, - int num_pages) + unsigned long num_pages) { unsigned long file_end; u64 isize = i_size_read(inode); @@ -1054,10 +1172,12 @@ again: page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; while (1) { - lock_extent(tree, page_start, page_end); + lock_extent_bits(tree, page_start, page_end, + 0, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); - unlock_extent(tree, page_start, page_end); + unlock_extent_cached(tree, page_start, page_end, + &cached_state, GFP_NOFS); if (!ordered) break; @@ -1172,8 +1292,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; int extent_thresh = range->extent_thresh; - int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; - int cluster = max_cluster; + unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; @@ -1206,7 +1326,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra = &file->f_ra; } - pages = kmalloc(sizeof(struct page *) * max_cluster, + pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_NOFS); if (!pages) { ret = -ENOMEM; @@ -1257,7 +1377,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); + printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); ret = -EAGAIN; break; } @@ -1334,8 +1454,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } } - if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) + if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { /* the filemap_flush will queue IO into the worker threads, but @@ -1381,6 +1505,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; + char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; @@ -1410,29 +1535,30 @@ static noinline int btrfs_ioctl_resize(struct file *file, sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { - char *end; sizestr = devstr + 1; *devstr = '\0'; devstr = vol_args->name; - devid = simple_strtoull(devstr, &end, 10); + ret = kstrtoull(devstr, 10, &devid); + if (ret) + goto out_free; if (!devid) { ret = -EINVAL; goto out_free; } - printk(KERN_INFO "btrfs: resizing devid %llu\n", devid); + btrfs_info(root->fs_info, "resizing devid %llu", devid); } device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", + btrfs_info(root->fs_info, "resizer unable to find device %llu", devid); ret = -ENODEV; goto out_free; } if (!device->writeable) { - printk(KERN_INFO "btrfs: resizer unable to apply on " - "readonly device %llu\n", + btrfs_info(root->fs_info, + "resizer unable to apply on readonly device %llu", devid); ret = -EPERM; goto out_free; @@ -1448,8 +1574,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, mod = 1; sizestr++; } - new_size = memparse(sizestr, NULL); - if (new_size == 0) { + new_size = memparse(sizestr, &retptr); + if (*retptr != '\0' || new_size == 0) { ret = -EINVAL; goto out_free; } @@ -1469,6 +1595,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, } new_size = old_size - new_size; } else if (mod > 0) { + if (new_size > ULLONG_MAX - old_size) { + ret = -ERANGE; + goto out_free; + } new_size = old_size + new_size; } @@ -1484,7 +1614,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", + printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", rcu_str_deref(device->name), new_size); if (new_size > old_size) { @@ -1545,9 +1675,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, src_inode = file_inode(src.file); if (src_inode->i_sb != file_inode(file)->i_sb) { - printk(KERN_INFO "btrfs: Snapshot src from " - "another FS\n"); - ret = -EINVAL; + btrfs_info(BTRFS_I(src_inode)->root->fs_info, + "Snapshot src from another FS"); + ret = -EXDEV; + } else if (!inode_owner_or_capable(src_inode)) { + /* + * Subvolume creation is not restricted, but snapshots + * are limited to own subvolumes only + */ + ret = -EPERM; } else { ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, @@ -1665,6 +1801,9 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; + if (!inode_owner_or_capable(inode)) + return -EPERM; + ret = mnt_want_write_file(file); if (ret) goto out; @@ -1689,11 +1828,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_drop_write; } - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out_drop_write; - } - down_write(&root->fs_info->subvol_sem); /* nothing to do */ @@ -1701,12 +1835,28 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_drop_sem; root_flags = btrfs_root_flags(&root->root_item); - if (flags & BTRFS_SUBVOL_RDONLY) + if (flags & BTRFS_SUBVOL_RDONLY) { btrfs_set_root_flags(&root->root_item, root_flags | BTRFS_ROOT_SUBVOL_RDONLY); - else - btrfs_set_root_flags(&root->root_item, + } else { + /* + * Block RO -> RW transition if this subvolume is involved in + * send + */ + spin_lock(&root->root_item_lock); + if (root->send_in_progress == 0) { + btrfs_set_root_flags(&root->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + spin_unlock(&root->root_item_lock); + } else { + spin_unlock(&root->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to set subvolume %llu read-write during send", + root->root_key.objectid); + ret = -EPERM; + goto out_drop_sem; + } + } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { @@ -1751,7 +1901,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { - ret = -ENOTEMPTY; + ret = -EPERM; + btrfs_err(root->fs_info, "deleting default subvolume " + "%llu is not allowed", key.objectid); goto out; } btrfs_release_path(path); @@ -1808,7 +1960,8 @@ static noinline int copy_to_sk(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, - char *buf, + size_t *buf_size, + char __user *ubuf, unsigned long *sk_offset, int *num_found) { @@ -1840,13 +1993,25 @@ static noinline int copy_to_sk(struct btrfs_root *root, if (!key_in_sk(key, sk)) continue; - if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE) + if (sizeof(sh) + item_len > *buf_size) { + if (*num_found) { + ret = 1; + goto out; + } + + /* + * return one empty item back for v1, which does not + * handle -EOVERFLOW + */ + + *buf_size = sizeof(sh) + item_len; item_len = 0; + ret = -EOVERFLOW; + } - if (sizeof(sh) + item_len + *sk_offset > - BTRFS_SEARCH_ARGS_BUFSIZE) { + if (sizeof(sh) + item_len + *sk_offset > *buf_size) { ret = 1; - goto overflow; + goto out; } sh.objectid = key->objectid; @@ -1856,20 +2021,33 @@ static noinline int copy_to_sk(struct btrfs_root *root, sh.transid = found_transid; /* copy search result header */ - memcpy(buf + *sk_offset, &sh, sizeof(sh)); + if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { + ret = -EFAULT; + goto out; + } + *sk_offset += sizeof(sh); if (item_len) { - char *p = buf + *sk_offset; + char __user *up = ubuf + *sk_offset; /* copy the item */ - read_extent_buffer(leaf, p, - item_off, item_len); + if (read_extent_buffer_to_user(leaf, up, + item_off, item_len)) { + ret = -EFAULT; + goto out; + } + *sk_offset += item_len; } (*num_found)++; - if (*num_found >= sk->nr_items) - break; + if (ret) /* -EOVERFLOW from above */ + goto out; + + if (*num_found >= sk->nr_items) { + ret = 1; + goto out; + } } advance_key: ret = 0; @@ -1884,23 +2062,37 @@ advance_key: key->objectid++; } else ret = 1; -overflow: +out: + /* + * 0: all items from this leaf copied, continue with next + * 1: * more items can be copied, but unused buffer is too small + * * all items were found + * Either way, it will stops the loop which iterates to the next + * leaf + * -EOVERFLOW: item was to large for buffer + * -EFAULT: could not copy extent buffer back to userspace + */ return ret; } static noinline int search_ioctl(struct inode *inode, - struct btrfs_ioctl_search_args *args) + struct btrfs_ioctl_search_key *sk, + size_t *buf_size, + char __user *ubuf) { struct btrfs_root *root; struct btrfs_key key; - struct btrfs_key max_key; struct btrfs_path *path; - struct btrfs_ioctl_search_key *sk = &args->key; struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; int ret; int num_found = 0; unsigned long sk_offset = 0; + if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) { + *buf_size = sizeof(struct btrfs_ioctl_search_header); + return -EOVERFLOW; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1914,7 +2106,7 @@ static noinline int search_ioctl(struct inode *inode, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", + printk(KERN_ERR "BTRFS: could not find root %llu\n", sk->tree_id); btrfs_free_path(path); return -ENOENT; @@ -1925,28 +2117,24 @@ static noinline int search_ioctl(struct inode *inode, key.type = sk->min_type; key.offset = sk->min_offset; - max_key.objectid = sk->max_objectid; - max_key.type = sk->max_type; - max_key.offset = sk->max_offset; - path->keep_locks = 1; - while(1) { - ret = btrfs_search_forward(root, &key, &max_key, path, - sk->min_transid); + while (1) { + ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { if (ret > 0) ret = 0; goto err; } - ret = copy_to_sk(root, path, &key, sk, args->buf, + ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); - if (ret || num_found >= sk->nr_items) + if (ret) break; } - ret = 0; + if (ret > 0) + ret = 0; err: sk->nr_items = num_found; btrfs_free_path(path); @@ -1956,22 +2144,73 @@ err: static noinline int btrfs_ioctl_tree_search(struct file *file, void __user *argp) { - struct btrfs_ioctl_search_args *args; - struct inode *inode; - int ret; + struct btrfs_ioctl_search_args __user *uargs; + struct btrfs_ioctl_search_key sk; + struct inode *inode; + int ret; + size_t buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - args = memdup_user(argp, sizeof(*args)); - if (IS_ERR(args)) - return PTR_ERR(args); + uargs = (struct btrfs_ioctl_search_args __user *)argp; + + if (copy_from_user(&sk, &uargs->key, sizeof(sk))) + return -EFAULT; + + buf_size = sizeof(uargs->buf); inode = file_inode(file); - ret = search_ioctl(inode, args); - if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + + /* + * In the origin implementation an overflow is handled by returning a + * search header with a len of zero, so reset ret. + */ + if (ret == -EOVERFLOW) + ret = 0; + + if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk))) ret = -EFAULT; - kfree(args); + return ret; +} + +static noinline int btrfs_ioctl_tree_search_v2(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_search_args_v2 __user *uarg; + struct btrfs_ioctl_search_args_v2 args; + struct inode *inode; + int ret; + size_t buf_size; + const size_t buf_limit = 16 * 1024 * 1024; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* copy search header and buffer size */ + uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; + if (copy_from_user(&args, uarg, sizeof(args))) + return -EFAULT; + + buf_size = args.buf_size; + + if (buf_size < sizeof(struct btrfs_ioctl_search_header)) + return -EOVERFLOW; + + /* limit result size to 16MB */ + if (buf_size > buf_limit) + buf_size = buf_limit; + + inode = file_inode(file); + ret = search_ioctl(inode, &args.key, &buf_size, + (char *)(&uarg->buf[0])); + if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) + ret = -EFAULT; + else if (ret == -EOVERFLOW && + copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size))) + ret = -EFAULT; + return ret; } @@ -2009,7 +2248,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", tree_id); + printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); ret = -ENOENT; goto out; } @@ -2018,7 +2257,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; - while(1) { + while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -2047,7 +2286,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, } *(ptr + len) = '/'; - read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len); + read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len); if (key.offset == BTRFS_FIRST_FREE_OBJECTID) break; @@ -2058,7 +2297,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, dirid = key.objectid; } memmove(name, ptr, total_len); - name[total_len]='\0'; + name[total_len] = '\0'; ret = 0; out: btrfs_free_path(path); @@ -2098,7 +2337,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, static noinline int btrfs_ioctl_snap_destroy(struct file *file, void __user *arg) { - struct dentry *parent = fdentry(file); + struct dentry *parent = file->f_path.dentry; struct dentry *dentry; struct inode *dir = parent->d_inode; struct inode *inode; @@ -2107,6 +2346,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; + u64 root_flags; u64 qgroup_reserved; int namelen; int ret; @@ -2128,9 +2368,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out; + err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); if (err == -EINTR) - goto out; + goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2144,7 +2385,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, inode = dentry->d_inode; dest = BTRFS_I(inode)->root; - if (!capable(CAP_SYS_ADMIN)){ + if (!capable(CAP_SYS_ADMIN)) { /* * Regular user. Only allow this with a special mount * option, when the user has write+exec access to the @@ -2189,6 +2430,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } mutex_lock(&inode->i_mutex); + + /* + * Don't allow to delete a subvolume with send in progress. This is + * inside the i_mutex so the error handling that has to drop the bit + * again is not run concurrently. + */ + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + if (dest->send_in_progress == 0) { + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } else { + spin_unlock(&dest->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to delete subvolume %llu during send", + dest->root_key.objectid); + err = -EPERM; + goto out_dput; + } + err = d_invalidate(dentry); if (err) goto out_unlock; @@ -2234,7 +2496,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.drop_level = 0; btrfs_set_root_refs(&dest->root_item, 0); - if (!xchg(&dest->orphan_item_inserted, 1)) { + if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, dest->root_key.objectid); @@ -2277,11 +2539,19 @@ out_release: out_up_write: up_write(&root->fs_info->subvol_sem); out_unlock: + if (err) { + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } mutex_unlock(&inode->i_mutex); if (!err) { shrink_dcache_sb(root->fs_info->sb); btrfs_invalidate_inodes(dest); d_delete(dentry); + ASSERT(dest->send_in_progress == 0); /* the last ref */ if (dest->cache_inode) { @@ -2293,6 +2563,7 @@ out_dput: dput(dentry); out_unlock_dir: mutex_unlock(&dir->i_mutex); +out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); @@ -2444,9 +2715,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; int ret = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); if (!fi_args) return -ENOMEM; @@ -2461,6 +2729,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) } mutex_unlock(&fs_devices->device_list_mutex); + fi_args->nodesize = root->fs_info->super_copy->nodesize; + fi_args->sectorsize = root->fs_info->super_copy->sectorsize; + fi_args->clone_alignment = root->fs_info->super_copy->sectorsize; + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; @@ -2476,9 +2748,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) int ret = 0; char *s_uuid = NULL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - di_args = memdup_user(arg, sizeof(*di_args)); if (IS_ERR(di_args)) return PTR_ERR(di_args); @@ -2556,10 +2825,15 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); ordered = btrfs_lookup_first_ordered_extent(inode, off + len - 1); - if (!ordered && + if ((!ordered || + ordered->file_offset + ordered->len <= off || + ordered->file_offset >= off + len) && !test_range_bit(&BTRFS_I(inode)->io_tree, off, - off + len - 1, EXTENT_DELALLOC, 0, NULL)) + off + len - 1, EXTENT_DELALLOC, 0, NULL)) { + if (ordered) + btrfs_put_ordered_extent(ordered); break; + } unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); if (ordered) btrfs_put_ordered_extent(ordered); @@ -2694,14 +2968,11 @@ out_unlock: #define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) static long btrfs_ioctl_file_extent_same(struct file *file, - void __user *argp) + struct btrfs_ioctl_same_args __user *argp) { - struct btrfs_ioctl_same_args tmp; struct btrfs_ioctl_same_args *same; struct btrfs_ioctl_same_extent_info *info; - struct inode *src = file->f_dentry->d_inode; - struct file *dst_file = NULL; - struct inode *dst; + struct inode *src = file_inode(file); u64 off; u64 len; int i; @@ -2709,6 +2980,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, unsigned long size; u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; bool is_admin = capable(CAP_SYS_ADMIN); + u16 count; if (!(file->f_mode & FMODE_READ)) return -EINVAL; @@ -2717,25 +2989,17 @@ static long btrfs_ioctl_file_extent_same(struct file *file, if (ret) return ret; - if (copy_from_user(&tmp, - (struct btrfs_ioctl_same_args __user *)argp, - sizeof(tmp))) { + if (get_user(count, &argp->dest_count)) { ret = -EFAULT; goto out; } - size = sizeof(tmp) + - tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info); + size = offsetof(struct btrfs_ioctl_same_args __user, info[count]); - same = kmalloc(size, GFP_NOFS); - if (!same) { - ret = -EFAULT; - goto out; - } + same = memdup_user(argp, size); - if (copy_from_user(same, - (struct btrfs_ioctl_same_args __user *)argp, size)) { - ret = -EFAULT; + if (IS_ERR(same)) { + ret = PTR_ERR(same); goto out; } @@ -2769,52 +3033,35 @@ static long btrfs_ioctl_file_extent_same(struct file *file, goto out; /* pre-format output fields to sane values */ - for (i = 0; i < same->dest_count; i++) { + for (i = 0; i < count; i++) { same->info[i].bytes_deduped = 0ULL; same->info[i].status = 0; } - ret = 0; - for (i = 0; i < same->dest_count; i++) { - info = &same->info[i]; - - dst_file = fget(info->fd); - if (!dst_file) { + for (i = 0, info = same->info; i < count; i++, info++) { + struct inode *dst; + struct fd dst_file = fdget(info->fd); + if (!dst_file.file) { info->status = -EBADF; - goto next; + continue; } + dst = file_inode(dst_file.file); - if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { + if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) { info->status = -EINVAL; - goto next; - } - - info->status = -EXDEV; - if (file->f_path.mnt != dst_file->f_path.mnt) - goto next; - - dst = dst_file->f_dentry->d_inode; - if (src->i_sb != dst->i_sb) - goto next; - - if (S_ISDIR(dst->i_mode)) { + } else if (file->f_path.mnt != dst_file.file->f_path.mnt) { + info->status = -EXDEV; + } else if (S_ISDIR(dst->i_mode)) { info->status = -EISDIR; - goto next; - } - - if (!S_ISREG(dst->i_mode)) { + } else if (!S_ISREG(dst->i_mode)) { info->status = -EACCES; - goto next; + } else { + info->status = btrfs_extent_same(src, off, len, dst, + info->logical_offset); + if (info->status == 0) + info->bytes_deduped += len; } - - info->status = btrfs_extent_same(src, off, len, dst, - info->logical_offset); - if (info->status == 0) - info->bytes_deduped += len; - -next: - if (dst_file) - fput(dst_file); + fdput(dst_file); } ret = copy_to_user(argp, same, size); @@ -2826,6 +3073,129 @@ out: return ret; } +/* Helper to check and see if this root currently has a ref on the given disk + * bytenr. If it does then we need to update the quota for this root. This + * doesn't do anything if quotas aren't enabled. + */ +static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 disko) +{ + struct seq_list tree_mod_seq_elem = {}; + struct ulist *roots; + struct ulist_iterator uiter; + struct ulist_node *root_node = NULL; + int ret; + + if (!root->fs_info->quota_enabled) + return 1; + + btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + ret = btrfs_find_all_roots(trans, root->fs_info, disko, + tree_mod_seq_elem.seq, &roots); + if (ret < 0) + goto out; + ret = 0; + ULIST_ITER_INIT(&uiter); + while ((root_node = ulist_next(roots, &uiter))) { + if (root_node->val == root->objectid) { + ret = 1; + break; + } + } + ulist_free(roots); +out: + btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + return ret; +} + +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, + struct inode *inode, + u64 endoff, + const u64 destoff, + const u64 olen) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* + * We round up to the block size at eof when determining which + * extents to clone above, but shouldn't round up the file size. + */ + if (endoff > destoff + olen) + endoff = destoff + olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_end_transaction(trans, root); +out: + return ret; +} + +static void clone_update_extent_map(struct inode *inode, + const struct btrfs_trans_handle *trans, + const struct btrfs_path *path, + const u64 hole_offset, + const u64 hole_len) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + return; + } + + if (path) { + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + btrfs_extent_item_to_extent_map(inode, path, fi, false, em); + em->generation = -1; + if (btrfs_file_extent_type(path->nodes[0], fi) == + BTRFS_FILE_EXTENT_INLINE) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } else { + em->start = hole_offset; + em->len = hole_len; + em->ram_bytes = em->len; + em->orig_start = hole_offset; + em->block_start = EXTENT_MAP_HOLE; + em->block_len = 0; + em->orig_block_len = 0; + em->compress_type = BTRFS_COMPRESS_NONE; + em->generation = trans->transid; + } + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + + if (unlikely(ret)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -2838,7 +3208,8 @@ out: * @destoff: Offset within @inode to start clone */ static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff) + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path = NULL; @@ -2849,7 +3220,10 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u32 nritems; int slot; int ret; - u64 len = olen_aligned; + int no_quota; + const u64 len = olen_aligned; + u64 last_disko = 0; + u64 last_dest_end = destoff; ret = -ENOMEM; buf = vmalloc(btrfs_level_size(root, 0)); @@ -2866,19 +3240,33 @@ static int btrfs_clone(struct inode *src, struct inode *inode, /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; + key.offset = off; while (1) { /* * note the key will change type as we walk through the * tree. */ + path->leave_spinning = 1; ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 0, 0); if (ret < 0) goto out; + /* + * First search, if no extent item that starts at offset off was + * found but the previous item is an extent item, it's possible + * it might overlap our target range, therefore process it. + */ + if (key.offset == off && ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0] - 1); + if (key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } nritems = btrfs_header_nritems(path->nodes[0]); +process_slot: + no_quota = 1; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -2903,12 +3291,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; - u64 endoff; - - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); + u64 drop_start; extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); @@ -2928,11 +3311,26 @@ static int btrfs_clone(struct inode *src, struct inode *inode, datal = btrfs_file_extent_ram_bytes(leaf, extent); } - btrfs_release_path(path); - if (key.offset + datal <= off || - key.offset >= off + len - 1) - goto next; + /* + * The first search might have left us at an extent + * item that ends before our target range's start, can + * happen if we have holes and NO_HOLES feature enabled. + */ + if (key.offset + datal <= off) { + path->slots[0]++; + goto process_slot; + } else if (key.offset >= off + len) { + break; + } + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + path->leave_spinning = 0; memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = btrfs_ino(inode); @@ -2942,6 +3340,18 @@ static int btrfs_clone(struct inode *src, struct inode *inode, new_key.offset = destoff; /* + * Deal with a hole that doesn't have an extent item + * that represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning + * range or at the beginning (fully overlaps it or + * partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + /* * 1 - adjusting old extent (we may have to split it) * 1 - add new extent * 1 - inode update @@ -2959,23 +3369,24 @@ static int btrfs_clone(struct inode *src, struct inode *inode, * | ------------- extent ------------- | */ - /* substract range b */ + /* subtract range b */ if (key.offset + datal > off + len) datal = off + len - key.offset; - /* substract range a */ + /* subtract range a */ if (off > key.offset) { datao += off - key.offset; datal -= off - key.offset; } ret = btrfs_drop_extents(trans, root, inode, - new_key.offset, + drop_start, new_key.offset + datal, 1); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, + root, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3006,6 +3417,28 @@ static int btrfs_clone(struct inode *src, struct inode *inode, datao); btrfs_set_file_extent_num_bytes(leaf, extent, datal); + + /* + * We need to look up the roots that point at + * this bytenr and see if the new root does. If + * it does not we need to make sure we update + * quotas appropriately. + */ + if (disko && root != BTRFS_I(src)->root && + disko != last_disko) { + no_quota = check_ref(trans, root, + disko); + if (no_quota < 0) { + btrfs_abort_transaction(trans, + root, + ret); + btrfs_end_transaction(trans, + root); + ret = no_quota; + goto out; + } + } + if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, @@ -3013,7 +3446,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, root->root_key.objectid, btrfs_ino(inode), new_key.offset - datao, - 0); + no_quota); if (ret) { btrfs_abort_transaction(trans, root, @@ -3027,6 +3460,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode, } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; + u64 aligned_end = 0; + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; @@ -3043,13 +3478,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode, size -= skip + trim; datal -= skip + trim; + aligned_end = ALIGN(new_key.offset + datal, + root->sectorsize); ret = btrfs_drop_extents(trans, root, inode, - new_key.offset, - new_key.offset + datal, + drop_start, + aligned_end, 1); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, + root, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3078,39 +3516,62 @@ static int btrfs_clone(struct inode *src, struct inode *inode, inode_add_bytes(inode, datal); } + /* If we have an implicit hole (NO_HOLES feature). */ + if (drop_start < new_key.offset) + clone_update_extent_map(inode, trans, + NULL, drop_start, + new_key.offset - drop_start); + + clone_update_extent_map(inode, trans, path, 0, 0); + btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - - /* - * we round up to the block size at eof when - * determining which extents to clone above, - * but shouldn't round up the file size - */ - endoff = new_key.offset + datal; - if (endoff > destoff+olen) - endoff = destoff+olen; - if (endoff > inode->i_size) - btrfs_i_size_write(inode, endoff); - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); + last_dest_end = new_key.offset + datal; + ret = clone_finish_inode_update(trans, inode, + last_dest_end, + destoff, olen); + if (ret) goto out; - } - ret = btrfs_end_transaction(trans, root); + if (new_key.offset + datal >= destoff + len) + break; } -next: btrfs_release_path(path); key.offset++; } ret = 0; + if (last_dest_end < destoff + len) { + /* + * We have an implicit hole (NO_HOLES feature is enabled) that + * fully or partially overlaps our cloning range at its end. + */ + btrfs_release_path(path); + + /* + * 1 - remove extent(s) + * 1 - inode update + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_drop_extents(trans, root, inode, + last_dest_end, destoff + len, 1); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + clone_update_extent_map(inode, trans, NULL, last_dest_end, + destoff + len - last_dest_end); + ret = clone_finish_inode_update(trans, inode, destoff + len, + destoff, olen); + } + out: - btrfs_release_path(path); btrfs_free_path(path); vfree(buf); return ret; @@ -3119,7 +3580,7 @@ out: static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct fd src_file; struct inode *src; @@ -3134,8 +3595,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * decompress into destination's address_space (the file offset * may change, so source mapping won't do), then recompress (or * otherwise reinsert) a subrange. - * - allow ranges within the same file to be cloned (provided - * they don't overlap)? + * + * - split destination inode's inline extents. The inline extents can + * be either compressed or non-compressed. */ /* the destination must be opened for writing */ @@ -3221,19 +3683,53 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_unlock; } - /* truncate page cache pages from target inode range */ - truncate_inode_pages_range(&inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len) - 1); + /* + * Lock the target range too. Right after we replace the file extent + * items in the fs tree (which now point to the cloned data), we might + * have a worker replace them with extent items relative to a write + * operation that was issued before this clone operation (i.e. confront + * with inode.c:btrfs_finish_ordered_io). + */ + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - lock_extent_range(src, off, len); + lock_extent_range(src, lock_start, lock_len); + } else { + lock_extent_range(src, off, len); + lock_extent_range(inode, destoff, len); + } ret = btrfs_clone(src, inode, off, olen, len, destoff); - unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_end = max_t(u64, off, destoff) + len - 1; + + unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); + } else { + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, + destoff + len - 1); + } + /* + * Truncate page cache pages so that future reads will see the cloned + * data immediately and not the previous data. + */ + truncate_inode_pages_range(&inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len) - 1); out_unlock: - mutex_unlock(&src->i_mutex); - if (!same_inode) - mutex_unlock(&inode->i_mutex); + if (!same_inode) { + if (inode < src) { + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + } else { + mutex_unlock(&inode->i_mutex); + mutex_unlock(&src->i_mutex); + } + } else { + mutex_unlock(&src->i_mutex); + } out_fput: fdput(src_file); out_drop_write: @@ -3356,8 +3852,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) if (IS_ERR_OR_NULL(di)) { btrfs_free_path(path); btrfs_end_transaction(trans, root); - printk(KERN_ERR "Umm, you don't have the default dir item, " - "this isn't going to work\n"); + btrfs_err(new_root->fs_info, "Umm, you don't have the default dir" + "item, this isn't going to work"); ret = -ENOENT; goto out; } @@ -3438,6 +3934,11 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) up_read(&info->groups_sem); } + /* + * Global block reserve, exported as a space_info + */ + slot_count++; + /* space_slots == 0 means they are asking for a count */ if (space_args.space_slots == 0) { space_args.total_spaces = slot_count; @@ -3496,6 +3997,21 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) up_read(&info->groups_sem); } + /* + * Add global block reserve + */ + if (slot_count) { + struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; + + spin_lock(&block_rsv->lock); + space.total_bytes = block_rsv->size; + space.used_bytes = block_rsv->size - block_rsv->reserved; + spin_unlock(&block_rsv->lock); + space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; + memcpy(dest, &space, sizeof(space)); + space_args.total_spaces++; + } + user_dest = (struct btrfs_ioctl_space_info __user *) (arg + sizeof(struct btrfs_ioctl_space_args)); @@ -3679,9 +4195,10 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) switch (p->cmd) { case BTRFS_IOCTL_DEV_REPLACE_CMD_START: - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - + if (root->fs_info->sb->s_flags & MS_RDONLY) { + ret = -EROFS; + goto out; + } if (atomic_xchg( &root->fs_info->mutually_exclusive_operation_running, 1)) { @@ -3707,7 +4224,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) if (copy_to_user(arg, p, sizeof(*p))) ret = -EFAULT; - +out: kfree(p); return ret; } @@ -4317,7 +4834,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4325,10 +4842,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) return btrfs_qgroup_wait_for_completion(root->fs_info); } -static long btrfs_ioctl_set_received_subvol(struct file *file, - void __user *arg) +static long _btrfs_ioctl_set_received_subvol(struct file *file, + struct btrfs_ioctl_received_subvol_args *sa) { - struct btrfs_ioctl_received_subvol_args *sa = NULL; struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; @@ -4337,6 +4853,9 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; int received_uuid_changed; + if (!inode_owner_or_capable(inode)) + return -EPERM; + ret = mnt_want_write_file(file); if (ret < 0) return ret; @@ -4353,18 +4872,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } - - sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) { - ret = PTR_ERR(sa); - sa = NULL; - goto out; - } - /* * 1 - root item * 2 - uuid items (received uuid + subvol uuid) @@ -4418,14 +4925,90 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } +out: + up_write(&root->fs_info->subvol_sem); + mnt_drop_write_file(file); + return ret; +} + +#ifdef CONFIG_64BIT +static long btrfs_ioctl_set_received_subvol_32(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; + struct btrfs_ioctl_received_subvol_args *args64 = NULL; + int ret = 0; + + args32 = memdup_user(arg, sizeof(*args32)); + if (IS_ERR(args32)) { + ret = PTR_ERR(args32); + args32 = NULL; + goto out; + } + + args64 = kmalloc(sizeof(*args64), GFP_NOFS); + if (!args64) { + ret = -ENOMEM; + goto out; + } + + memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); + args64->stransid = args32->stransid; + args64->rtransid = args32->rtransid; + args64->stime.sec = args32->stime.sec; + args64->stime.nsec = args32->stime.nsec; + args64->rtime.sec = args32->rtime.sec; + args64->rtime.nsec = args32->rtime.nsec; + args64->flags = args32->flags; + + ret = _btrfs_ioctl_set_received_subvol(file, args64); + if (ret) + goto out; + + memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); + args32->stransid = args64->stransid; + args32->rtransid = args64->rtransid; + args32->stime.sec = args64->stime.sec; + args32->stime.nsec = args64->stime.nsec; + args32->rtime.sec = args64->rtime.sec; + args32->rtime.nsec = args64->rtime.nsec; + args32->flags = args64->flags; + + ret = copy_to_user(arg, args32, sizeof(*args32)); + if (ret) + ret = -EFAULT; + +out: + kfree(args32); + kfree(args64); + return ret; +} +#endif + +static long btrfs_ioctl_set_received_subvol(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args *sa = NULL; + int ret = 0; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + sa = NULL; + goto out; + } + + ret = _btrfs_ioctl_set_received_subvol(file, sa); + + if (ret) + goto out; + ret = copy_to_user(arg, sa, sizeof(*sa)); if (ret) ret = -EFAULT; out: kfree(sa); - up_write(&root->fs_info->subvol_sem); - mnt_drop_write_file(file); return ret; } @@ -4443,8 +5026,8 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) len = strnlen(label, BTRFS_LABEL_SIZE); if (len == BTRFS_LABEL_SIZE) { - pr_warn("btrfs: label is too long, return the first %zu bytes\n", - --len); + btrfs_warn(root->fs_info, + "label is too long, return the first %zu bytes", --len); } ret = copy_to_user(arg, label, len); @@ -4467,7 +5050,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) return -EFAULT; if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { - pr_err("btrfs: unable to set label with more than %d bytes\n", + btrfs_err(root->fs_info, "unable to set label with more than %d bytes", BTRFS_LABEL_SIZE - 1); return -EINVAL; } @@ -4485,13 +5068,173 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) spin_lock(&root->fs_info->super_lock); strcpy(super_block->label, label); spin_unlock(&root->fs_info->super_lock); - ret = btrfs_end_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); out_unlock: mnt_drop_write_file(file); return ret; } +#define INIT_FEATURE_FLAGS(suffix) \ + { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ + .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ + .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } + +static int btrfs_ioctl_get_supported_features(struct file *file, + void __user *arg) +{ + static struct btrfs_ioctl_feature_flags features[3] = { + INIT_FEATURE_FLAGS(SUPP), + INIT_FEATURE_FLAGS(SAFE_SET), + INIT_FEATURE_FLAGS(SAFE_CLEAR) + }; + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int btrfs_ioctl_get_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags features; + + features.compat_flags = btrfs_super_compat_flags(super_block); + features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); + features.incompat_flags = btrfs_super_incompat_flags(super_block); + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int check_feature_bits(struct btrfs_root *root, + enum btrfs_feature_set set, + u64 change_mask, u64 flags, u64 supported_flags, + u64 safe_set, u64 safe_clear) +{ + const char *type = btrfs_feature_set_names[set]; + char *names; + u64 disallowed, unsupported; + u64 set_mask = flags & change_mask; + u64 clear_mask = ~flags & change_mask; + + unsupported = set_mask & ~supported_flags; + if (unsupported) { + names = btrfs_printable_features(set, unsupported); + if (names) { + btrfs_warn(root->fs_info, + "this kernel does not support the %s feature bit%s", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "this kernel does not support %s bits 0x%llx", + type, unsupported); + return -EOPNOTSUPP; + } + + disallowed = set_mask & ~safe_set; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't set the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "can't set %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + disallowed = clear_mask & ~safe_clear; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't clear the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "can't clear %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + return 0; +} + +#define check_feature(root, change_mask, flags, mask_base) \ +check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \ + BTRFS_FEATURE_ ## mask_base ## _SUPP, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) + +static int btrfs_ioctl_set_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags flags[2]; + struct btrfs_trans_handle *trans; + u64 newflags; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(flags, arg, sizeof(flags))) + return -EFAULT; + + /* Nothing to do */ + if (!flags[0].compat_flags && !flags[0].compat_ro_flags && + !flags[0].incompat_flags) + return 0; + + ret = check_feature(root, flags[0].compat_flags, + flags[1].compat_flags, COMPAT); + if (ret) + return ret; + + ret = check_feature(root, flags[0].compat_ro_flags, + flags[1].compat_ro_flags, COMPAT_RO); + if (ret) + return ret; + + ret = check_feature(root, flags[0].incompat_flags, + flags[1].incompat_flags, INCOMPAT); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&root->fs_info->super_lock); + newflags = btrfs_super_compat_flags(super_block); + newflags |= flags[0].compat_flags & flags[1].compat_flags; + newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags); + btrfs_set_super_compat_flags(super_block, newflags); + + newflags = btrfs_super_compat_ro_flags(super_block); + newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; + newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); + btrfs_set_super_compat_ro_flags(super_block, newflags); + + newflags = btrfs_super_incompat_flags(super_block); + newflags |= flags[0].incompat_flags & flags[1].incompat_flags; + newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); + btrfs_set_super_incompat_flags(super_block, newflags); + spin_unlock(&root->fs_info->super_lock); + + return btrfs_commit_transaction(trans, root); +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4549,6 +5292,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_trans_end(file); case BTRFS_IOC_TREE_SEARCH: return btrfs_ioctl_tree_search(file, argp); + case BTRFS_IOC_TREE_SEARCH_V2: + return btrfs_ioctl_tree_search_v2(file, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(file, argp); case BTRFS_IOC_INO_PATHS: @@ -4557,9 +5302,15 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); - case BTRFS_IOC_SYNC: - btrfs_sync_fs(file->f_dentry->d_sb, 1); - return 0; + case BTRFS_IOC_SYNC: { + int ret; + + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); + if (ret) + return ret; + ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); + return ret; + } case BTRFS_IOC_START_SYNC: return btrfs_ioctl_start_sync(root, argp); case BTRFS_IOC_WAIT_SYNC: @@ -4578,6 +5329,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_balance_progress(root, argp); case BTRFS_IOC_SET_RECEIVED_SUBVOL: return btrfs_ioctl_set_received_subvol(file, argp); +#ifdef CONFIG_64BIT + case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: + return btrfs_ioctl_set_received_subvol_32(file, argp); +#endif case BTRFS_IOC_SEND: return btrfs_ioctl_send(file, argp); case BTRFS_IOC_GET_DEV_STATS: @@ -4604,6 +5359,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_fslabel(file, argp); case BTRFS_IOC_FILE_EXTENT_SAME: return btrfs_ioctl_file_extent_same(file, argp); + case BTRFS_IOC_GET_SUPPORTED_FEATURES: + return btrfs_ioctl_get_supported_features(file, argp); + case BTRFS_IOC_GET_FEATURES: + return btrfs_ioctl_get_features(file, argp); + case BTRFS_IOC_SET_FEATURES: + return btrfs_ioctl_set_features(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 01277b8f237..5665d214924 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -33,14 +33,14 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); */ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } + /* + * no lock is required. The lock owner may change if + * we have a read lock, but it won't change to or away + * from us. If we have the write lock, we are the owner + * and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; if (rw == BTRFS_WRITE_LOCK) { if (atomic_read(&eb->blocking_writers) == 0) { WARN_ON(atomic_read(&eb->spinning_writers) != 1); @@ -65,14 +65,15 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) */ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } + /* + * no lock is required. The lock owner may change if + * we have a read lock, but it won't change to or away + * from us. If we have the write lock, we are the owner + * and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; + if (rw == BTRFS_WRITE_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); @@ -99,6 +100,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) void btrfs_tree_read_lock(struct extent_buffer *eb) { again: + BUG_ON(!atomic_read(&eb->blocking_writers) && + current->pid == eb->lock_owner); + read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers) && current->pid == eb->lock_owner) { @@ -132,7 +136,9 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) if (atomic_read(&eb->blocking_writers)) return 0; - read_lock(&eb->lock); + if (!read_trylock(&eb->lock)) + return 0; + if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); return 0; @@ -151,7 +157,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) if (atomic_read(&eb->blocking_writers) || atomic_read(&eb->blocking_readers)) return 0; - write_lock(&eb->lock); + + if (!write_trylock(&eb->lock)) + return 0; + if (atomic_read(&eb->blocking_writers) || atomic_read(&eb->blocking_readers)) { write_unlock(&eb->lock); @@ -168,14 +177,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); + /* + * if we're nested, we have the write lock. No new locking + * is needed as long as we are the lock owner. + * The write unlock will do a barrier for us, and the lock_nested + * field only matters to the lock owner. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + return; } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->spinning_readers) == 0); @@ -189,14 +199,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); + /* + * if we're nested, we have the write lock. No new locking + * is needed as long as we are the lock owner. + * The write unlock will do a barrier for us, and the lock_nested + * field only matters to the lock owner. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + return; } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); @@ -244,6 +255,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb) BUG_ON(blockers > 1); btrfs_assert_tree_locked(eb); + eb->lock_owner = 0; atomic_dec(&eb->write_locks); if (blockers) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b6a6f07c5ce..dfad8514f0d 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -141,9 +141,9 @@ static int lzo_compress_pages(struct list_head *ws, ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); if (ret != LZO_E_OK) { - printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); - ret = -1; + ret = -EIO; goto out; } @@ -189,7 +189,7 @@ static int lzo_compress_pages(struct list_head *ws, kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; - ret = -1; + ret = -E2BIG; goto out; } @@ -208,7 +208,7 @@ static int lzo_compress_pages(struct list_head *ws, /* we're making it bigger, give up */ if (tot_in > 8192 && tot_in < tot_out) { - ret = -1; + ret = -E2BIG; goto out; } @@ -335,7 +335,7 @@ cont: break; if (page_in_index + 1 >= total_pages_in) { - ret = -1; + ret = -EIO; goto done; } @@ -357,8 +357,8 @@ cont: if (need_unmap) kunmap(pages_in[page_in_index - 1]); if (ret != LZO_E_OK) { - printk(KERN_WARNING "btrfs decompress failed\n"); - ret = -1; + printk(KERN_WARNING "BTRFS: decompress failed\n"); + ret = -EIO; break; } @@ -401,13 +401,13 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, out_len = PAGE_CACHE_SIZE; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { - printk(KERN_WARNING "btrfs decompress failed!\n"); - ret = -1; + printk(KERN_WARNING "BTRFS: decompress failed!\n"); + ret = -EIO; goto out; } if (out_len < start_byte) { - ret = -1; + ret = -EIO; goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c702cb62f78..7187b14faa6 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " - "%llu\n", offset); + "%llu", offset); } /* @@ -336,22 +336,26 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, entry->len); *file_offset = dec_end; if (dec_start > dec_end) { - printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", - dec_start, dec_end); + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordering dec_start %llu end %llu", dec_start, dec_end); } to_dec = dec_end - dec_start; if (to_dec > entry->bytes_left) { - printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", - entry->bytes_left, to_dec); + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", + entry->bytes_left, to_dec); } entry->bytes_left -= to_dec; if (!uptodate) set_bit(BTRFS_ORDERED_IOERR, &entry->flags); - if (entry->bytes_left == 0) + if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - else + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { ret = 1; + } out: if (!ret && cached && entry) { *cached = entry; @@ -401,17 +405,21 @@ have_entry: } if (io_size > entry->bytes_left) { - printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); } entry->bytes_left -= io_size; if (!uptodate) set_bit(BTRFS_ORDERED_IOERR, &entry->flags); - if (entry->bytes_left == 0) + if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - else + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { ret = 1; + } out: if (!ret && cached && entry) { *cached = entry; @@ -422,27 +430,48 @@ out: } /* Needs to either be called under a log transaction or the log_mutex */ -void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list) { struct btrfs_ordered_inode_tree *tree; struct btrfs_ordered_extent *ordered; struct rb_node *n; - int index = log->log_transid % 2; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); for (n = rb_first(&tree->tree); n; n = rb_next(n)) { ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); - spin_lock(&log->log_extents_lock[index]); - if (list_empty(&ordered->log_list)) { - list_add_tail(&ordered->log_list, &log->logged_list[index]); - atomic_inc(&ordered->refs); - } - spin_unlock(&log->log_extents_lock[index]); + if (!list_empty(&ordered->log_list)) + continue; + list_add_tail(&ordered->log_list, logged_list); + atomic_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); } +void btrfs_put_logged_extents(struct list_head *logged_list) +{ + struct btrfs_ordered_extent *ordered; + + while (!list_empty(logged_list)) { + ordered = list_first_entry(logged_list, + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log) +{ + int index = log->log_transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + list_splice_tail(logged_list, &log->logged_list[index]); + spin_unlock_irq(&log->log_extents_lock[index]); +} + void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) { struct btrfs_ordered_extent *ordered; @@ -455,8 +484,19 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) log_list); list_del_init(&ordered->log_list); spin_unlock_irq(&log->log_extents_lock[index]); + + if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + struct inode *inode = ordered->inode; + u64 start = ordered->file_offset; + u64 end = ordered->file_offset + ordered->len - 1; + + WARN_ON(!inode); + filemap_fdatawrite_range(inode->i_mapping, start, end); + } wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); + btrfs_put_ordered_extent(ordered); spin_lock_irq(&log->log_extents_lock[index]); } @@ -520,7 +560,8 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); - tree->last = NULL; + if (tree->last == node) + tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); spin_unlock_irq(&tree->lock); @@ -537,7 +578,9 @@ void btrfs_remove_ordered_extent(struct inode *inode, */ if (RB_EMPTY_ROOT(&tree->tree) && !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + spin_lock(&root->fs_info->ordered_root_lock); list_del_init(&BTRFS_I(inode)->ordered_operations); + spin_unlock(&root->fs_info->ordered_root_lock); } if (!root->nr_ordered_extents) { @@ -563,18 +606,19 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -void btrfs_wait_ordered_extents(struct btrfs_root *root) +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) { struct list_head splice, works; struct btrfs_ordered_extent *ordered, *next; + int count = 0; INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); - mutex_lock(&root->fs_info->ordered_operations_mutex); + mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); list_splice_init(&root->ordered_extents, &splice); - while (!list_empty(&splice)) { + while (!list_empty(&splice) && nr) { ordered = list_first_entry(&splice, struct btrfs_ordered_extent, root_extent_list); list_move_tail(&ordered->root_extent_list, @@ -582,14 +626,19 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root) atomic_inc(&ordered->refs); spin_unlock(&root->ordered_extent_lock); - ordered->flush_work.func = btrfs_run_ordered_extent_work; + btrfs_init_work(&ordered->flush_work, + btrfs_run_ordered_extent_work, NULL, NULL); list_add_tail(&ordered->work_list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &ordered->flush_work); + btrfs_queue_work(root->fs_info->flush_workers, + &ordered->flush_work); cond_resched(); spin_lock(&root->ordered_extent_lock); + if (nr != -1) + nr--; + count++; } + list_splice_tail(&splice, &root->ordered_extents); spin_unlock(&root->ordered_extent_lock); list_for_each_entry_safe(ordered, next, &works, work_list) { @@ -598,19 +647,23 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root) btrfs_put_ordered_extent(ordered); cond_resched(); } - mutex_unlock(&root->fs_info->ordered_operations_mutex); + mutex_unlock(&root->ordered_extent_mutex); + + return count; } -void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info) +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) { struct btrfs_root *root; struct list_head splice; + int done; INIT_LIST_HEAD(&splice); + mutex_lock(&fs_info->ordered_operations_mutex); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); - while (!list_empty(&splice)) { + while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, ordered_root); root = btrfs_grab_fs_root(root); @@ -619,12 +672,18 @@ void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info) &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - btrfs_wait_ordered_extents(root); + done = btrfs_wait_ordered_extents(root, nr); btrfs_put_fs_root(root); spin_lock(&fs_info->ordered_root_lock); + if (nr != -1) { + nr -= done; + WARN_ON(nr < 0); + } } + list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); + mutex_unlock(&fs_info->ordered_operations_mutex); } /* @@ -686,8 +745,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, goto out; } list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); cond_resched(); spin_lock(&root->fs_info->ordered_root_lock); @@ -734,8 +793,9 @@ void btrfs_start_ordered_extent(struct inode *inode, /* * Used to wait on ordered extents across a large range of bytes. */ -void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { + int ret = 0; u64 end; u64 orig_end; struct btrfs_ordered_extent *ordered; @@ -751,8 +811,9 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) /* start IO across the range first to instantiate any delalloc * extents */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - + ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + if (ret) + return ret; /* * So with compression we will find and lock a dirty page and clear the * first one as dirty, setup an async extent, and immediately return @@ -768,10 +829,15 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) * right and you are wrong. */ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - filemap_fdatawait_range(inode->i_mapping, start, orig_end); + &BTRFS_I(inode)->runtime_flags)) { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + orig_end); + if (ret) + return ret; + } + ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); + if (ret) + return ret; end = orig_end; while (1) { @@ -782,17 +848,20 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - if (ordered->file_offset + ordered->len < start) { + if (ordered->file_offset + ordered->len <= start) { btrfs_put_ordered_extent(ordered); break; } btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; + if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + ret = -EIO; btrfs_put_ordered_extent(ordered); - if (end == 0 || end == start) + if (ret || end == 0 || end == start) break; end--; } + return ret; } /* @@ -1076,7 +1145,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, * if this file hasn't been changed since the last transaction * commit, we can safely return without doing anything */ - if (last_mod < root->fs_info->last_trans_committed) + if (last_mod <= root->fs_info->last_trans_committed) return; spin_lock(&root->fs_info->ordered_root_lock); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 0c0b35612d7..246897058ef 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -180,7 +180,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, u64 file_offset); void btrfs_start_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry, int wait); -void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, @@ -195,9 +195,13 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); -void btrfs_wait_ordered_extents(struct btrfs_root *root); -void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info); -void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list); +void btrfs_put_logged_extents(struct list_head *logged_list); +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log); void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 24cad1695af..65793edb38c 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -69,23 +69,3 @@ out: btrfs_free_path(path); return ret; } - -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) -{ - struct btrfs_path *path; - struct btrfs_key key; - int ret; - - key.objectid = BTRFS_ORPHAN_OBJECTID; - key.type = BTRFS_ORPHAN_ITEM_KEY; - key.offset = offset; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - - btrfs_free_path(path); - return ret; -} diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0088bedc863..9626b4ad3b9 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot) +static void print_extent_item(struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot) struct btrfs_disk_key key; unsigned long end; unsigned long ptr; - int type; u32 item_size = btrfs_item_size_nr(eb, slot); u64 flags; u64 offset; @@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot) btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), flags); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if ((type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); @@ -154,7 +154,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, u32 item_size) { if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + pr_warn("BTRFS: uuid item with illegal size %lu!\n", (unsigned long)item_size); return; } @@ -193,7 +193,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d", btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); for (i = 0 ; i < nr ; i++) { - item = btrfs_item_nr(l, i); + item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); type = btrfs_key_type(&key); printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " @@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - print_extent_item(l, i); + case BTRFS_METADATA_ITEM_KEY: + print_extent_item(l, i, type); break; case BTRFS_TREE_BLOCK_REF_KEY: printk(KERN_INFO "\t\ttree block backref\n"); @@ -249,7 +250,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) BTRFS_FILE_EXTENT_INLINE) { printk(KERN_INFO "\t\tinline extent data " "size %u\n", - btrfs_file_extent_inline_len(l, fi)); + btrfs_file_extent_inline_len(l, i, fi)); break; } printk(KERN_INFO "\t\textent data disk bytenr %llu " diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c new file mode 100644 index 00000000000..129b1dd2852 --- /dev/null +++ b/fs/btrfs/props.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/hashtable.h> +#include "props.h" +#include "btrfs_inode.h" +#include "hash.h" +#include "transaction.h" +#include "xattr.h" + +#define BTRFS_PROP_HANDLERS_HT_BITS 8 +static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); + +struct prop_handler { + struct hlist_node node; + const char *xattr_name; + int (*validate)(const char *value, size_t len); + int (*apply)(struct inode *inode, const char *value, size_t len); + const char *(*extract)(struct inode *inode); + int inheritable; +}; + +static int prop_compression_validate(const char *value, size_t len); +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len); +static const char *prop_compression_extract(struct inode *inode); + +static struct prop_handler prop_handlers[] = { + { + .xattr_name = XATTR_BTRFS_PREFIX "compression", + .validate = prop_compression_validate, + .apply = prop_compression_apply, + .extract = prop_compression_extract, + .inheritable = 1 + }, + { + .xattr_name = NULL + } +}; + +void __init btrfs_props_init(void) +{ + struct prop_handler *p; + + hash_init(prop_handlers_ht); + + for (p = &prop_handlers[0]; p->xattr_name; p++) { + u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); + + hash_add(prop_handlers_ht, &p->node, h); + } +} + +static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash) +{ + struct hlist_head *h; + + h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)]; + if (hlist_empty(h)) + return NULL; + + return h; +} + +static const struct prop_handler * +find_prop_handler(const char *name, + const struct hlist_head *handlers) +{ + struct prop_handler *h; + + if (!handlers) { + u64 hash = btrfs_name_hash(name, strlen(name)); + + handlers = find_prop_handlers_by_hash(hash); + if (!handlers) + return NULL; + } + + hlist_for_each_entry(h, handlers, node) + if (!strcmp(h->xattr_name, name)) + return h; + + return NULL; +} + +static int __btrfs_set_prop(struct btrfs_trans_handle *trans, + struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + const struct prop_handler *handler; + int ret; + + if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN) + return -EINVAL; + + handler = find_prop_handler(name, NULL); + if (!handler) + return -EINVAL; + + if (value_len == 0) { + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + if (ret) + return ret; + + ret = handler->apply(inode, NULL, 0); + ASSERT(ret == 0); + + return ret; + } + + ret = handler->validate(value, value_len); + if (ret) + return ret; + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + value, value_len, flags); + if (ret) + return ret; + ret = handler->apply(inode, value, value_len); + if (ret) { + __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + return ret; + } + + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); + + return 0; +} + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + return __btrfs_set_prop(NULL, inode, name, value, value_len, flags); +} + +static int iterate_object_props(struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, + void (*iterator)(void *, + const struct prop_handler *, + const char *, + size_t), + void *ctx) +{ + int ret; + char *name_buf = NULL; + char *value_buf = NULL; + int name_buf_len = 0; + int value_buf_len = 0; + + while (1) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + u32 total_len, cur, this_len; + int slot; + const struct hlist_head *handlers; + + slot = path->slots[0]; + leaf = path->nodes[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid) + break; + if (key.type != BTRFS_XATTR_ITEM_KEY) + break; + + handlers = find_prop_handlers_by_hash(key.offset); + if (!handlers) + goto next_slot; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + cur = 0; + total_len = btrfs_item_size_nr(leaf, slot); + + while (cur < total_len) { + u32 name_len = btrfs_dir_name_len(leaf, di); + u32 data_len = btrfs_dir_data_len(leaf, di); + unsigned long name_ptr, data_ptr; + const struct prop_handler *handler; + + this_len = sizeof(*di) + name_len + data_len; + name_ptr = (unsigned long)(di + 1); + data_ptr = name_ptr + name_len; + + if (name_len <= XATTR_BTRFS_PREFIX_LEN || + memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, + name_ptr, + XATTR_BTRFS_PREFIX_LEN)) + goto next_dir_item; + + if (name_len >= name_buf_len) { + kfree(name_buf); + name_buf_len = name_len + 1; + name_buf = kmalloc(name_buf_len, GFP_NOFS); + if (!name_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, name_buf, name_ptr, name_len); + name_buf[name_len] = '\0'; + + handler = find_prop_handler(name_buf, handlers); + if (!handler) + goto next_dir_item; + + if (data_len > value_buf_len) { + kfree(value_buf); + value_buf_len = data_len; + value_buf = kmalloc(data_len, GFP_NOFS); + if (!value_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, value_buf, data_ptr, data_len); + + iterator(ctx, handler, value_buf, data_len); +next_dir_item: + cur += this_len; + di = (struct btrfs_dir_item *)((char *) di + this_len); + } + +next_slot: + path->slots[0]++; + } + + ret = 0; +out: + btrfs_release_path(path); + kfree(name_buf); + kfree(value_buf); + + return ret; +} + +static void inode_prop_iterator(void *ctx, + const struct prop_handler *handler, + const char *value, + size_t len) +{ + struct inode *inode = ctx; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = handler->apply(inode, value, len); + if (unlikely(ret)) + btrfs_warn(root->fs_info, + "error applying prop %s to ino %llu (root %llu): %d", + handler->xattr_name, btrfs_ino(inode), + root->root_key.objectid, ret); + else + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); +} + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 ino = btrfs_ino(inode); + int ret; + + ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode); + + return ret; +} + +static int inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *parent) +{ + const struct prop_handler *h; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!test_bit(BTRFS_INODE_HAS_PROPS, + &BTRFS_I(parent)->runtime_flags)) + return 0; + + for (h = &prop_handlers[0]; h->xattr_name; h++) { + const char *value; + u64 num_bytes; + + if (!h->inheritable) + continue; + + value = h->extract(parent); + if (!value) + continue; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + num_bytes, BTRFS_RESERVE_NO_FLUSH); + if (ret) + goto out; + ret = __btrfs_set_prop(trans, inode, h->xattr_name, + value, strlen(value), 0); + btrfs_block_rsv_release(root, trans->block_rsv, num_bytes); + if (ret) + goto out; + } + ret = 0; +out: + return ret; +} + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir) +{ + if (!dir) + return 0; + + return inherit_props(trans, inode, dir); +} + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root) +{ + struct btrfs_key key; + struct inode *parent_inode, *child_inode; + int ret; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + parent_inode = btrfs_iget(parent_root->fs_info->sb, &key, + parent_root, NULL); + if (IS_ERR(parent_inode)) + return PTR_ERR(parent_inode); + + child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + if (IS_ERR(child_inode)) { + iput(parent_inode); + return PTR_ERR(child_inode); + } + + ret = inherit_props(trans, child_inode, parent_inode); + iput(child_inode); + iput(parent_inode); + + return ret; +} + +static int prop_compression_validate(const char *value, size_t len) +{ + if (!strncmp("lzo", value, len)) + return 0; + else if (!strncmp("zlib", value, len)) + return 0; + + return -EINVAL; +} + +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len) +{ + int type; + + if (len == 0) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + + return 0; + } + + if (!strncmp("lzo", value, len)) + type = BTRFS_COMPRESS_LZO; + else if (!strncmp("zlib", value, len)) + type = BTRFS_COMPRESS_ZLIB; + else + return -EINVAL; + + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = type; + + return 0; +} + +static const char *prop_compression_extract(struct inode *inode) +{ + switch (BTRFS_I(inode)->force_compress) { + case BTRFS_COMPRESS_ZLIB: + return "zlib"; + case BTRFS_COMPRESS_LZO: + return "lzo"; + } + + return NULL; +} diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h new file mode 100644 index 00000000000..100f18829d5 --- /dev/null +++ b/fs/btrfs/props.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_PROPS_H +#define __BTRFS_PROPS_H + +#include "ctree.h" + +void __init btrfs_props_init(void); + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags); + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir); + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root); + +#endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4e6ef490619..98cb6b2630f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -32,6 +32,7 @@ #include "ulist.h" #include "backref.h" #include "extent_io.h" +#include "qgroup.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -84,8 +85,8 @@ struct btrfs_qgroup { /* * temp variables for accounting operations */ - u64 tag; - u64 refcnt; + u64 old_refcnt; + u64 new_refcnt; }; /* @@ -98,6 +99,9 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; +#define ptr_to_u64(x) ((u64)(uintptr_t)x) +#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x) + static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); @@ -242,6 +246,21 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info, return -ENOENT; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl) +{ + struct btrfs_qgroup *qgroup; + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) + return -EINVAL; + if (qgroup->rfer != rfer || qgroup->excl != excl) + return -EINVAL; + return 0; +} +#endif + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -301,16 +320,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) if (btrfs_qgroup_status_version(l, ptr) != BTRFS_QGROUP_STATUS_VERSION) { - printk(KERN_ERR - "btrfs: old qgroup version, quota disabled\n"); + btrfs_err(fs_info, + "old qgroup version, quota disabled"); goto out; } if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - printk(KERN_ERR - "btrfs: qgroup generation mismatch, " - "marked as inconsistent\n"); + btrfs_err(fs_info, + "qgroup generation mismatch, " + "marked as inconsistent"); } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); @@ -325,7 +344,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { - printk(KERN_ERR "btrfs: inconsitent qgroup config\n"); + btrfs_err(fs_info, "inconsitent qgroup config"); flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } if (!qgroup) { @@ -396,8 +415,8 @@ next1: ret = add_relation_rb(fs_info, found_key.objectid, found_key.offset); if (ret == -ENOENT) { - printk(KERN_WARNING - "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", + btrfs_warn(fs_info, + "orphan qgroup relation 0x%llx->0x%llx", found_key.objectid, found_key.offset); ret = 0; /* ignore the error */ } @@ -520,6 +539,10 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, "a_root->state))) + return 0; +#endif path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -644,8 +667,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - qgroup_limit = btrfs_item_ptr(l, path->slots[0], - struct btrfs_qgroup_limit_item); + qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); @@ -670,6 +692,10 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroup->qgroupid; @@ -687,8 +713,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - qgroup_info = btrfs_item_ptr(l, path->slots[0], - struct btrfs_qgroup_info_item); + qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); @@ -1161,7 +1186,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, limit->rsv_excl); if (ret) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - printk(KERN_INFO "unable to update quota limit for %llu\n", + btrfs_info(fs_info, "unable to update quota limit for %llu", qgroupid); } @@ -1176,33 +1201,198 @@ out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } +static int comp_oper(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->seq < oper2->seq) + return -1; + if (oper1->seq > oper2->seq) + return -1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + if (oper1->type < oper2->type) + return -1; + if (oper1->type > oper2->type) + return 1; + return 0; +} + +static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + p = &fs_info->qgroup_op_tree.rb_node; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct btrfs_qgroup_operation, n); + cmp = comp_oper(cur, oper); + if (cmp < 0) { + p = &(*p)->rb_right; + } else if (cmp) { + p = &(*p)->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + rb_link_node(&oper->n, parent, p); + rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} /* - * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts - * the modification into a list that's later used by btrfs_end_transaction to - * pass the recorded modifications on to btrfs_qgroup_account_ref. + * Record a quota operation for processing later on. + * @trans: the transaction we are adding the delayed op to. + * @fs_info: the fs_info for this fs. + * @ref_root: the root of the reference we are acting on, + * @bytenr: the bytenr we are acting on. + * @num_bytes: the number of bytes in the reference. + * @type: the type of operation this is. + * @mod_seq: do we need to get a sequence number for looking up roots. + * + * We just add it to our trans qgroup_ref_list and carry on and process these + * operations in order at some later point. If the reference root isn't a fs + * root then we don't bother with doing anything. + * + * MUST BE HOLDING THE REF LOCK. */ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, int mod_seq) { - struct qgroup_update *u; + struct btrfs_qgroup_operation *oper; + int ret; - BUG_ON(!trans->delayed_ref_elem.seq); - u = kmalloc(sizeof(*u), GFP_NOFS); - if (!u) + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + return 0; + + oper = kmalloc(sizeof(*oper), GFP_NOFS); + if (!oper) return -ENOMEM; - u->node = node; - u->extent_op = extent_op; - list_add_tail(&u->list, &trans->qgroup_ref_list); + oper->ref_root = ref_root; + oper->bytenr = bytenr; + oper->num_bytes = num_bytes; + oper->type = type; + oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); + INIT_LIST_HEAD(&oper->elem.list); + oper->elem.seq = 0; + ret = insert_qgroup_oper(fs_info, oper); + if (ret) { + /* Shouldn't happen so have an assert for developers */ + ASSERT(0); + kfree(oper); + return ret; + } + list_add_tail(&oper->list, &trans->qgroup_ref_list); + + if (mod_seq) + btrfs_get_tree_mod_seq(fs_info, &oper->elem); return 0; } -static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq) +/* + * The easy accounting, if we are adding/removing the only ref for an extent + * then this qgroup and all of the parent qgroups get their refrence and + * exclusive counts adjusted. + */ +static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct btrfs_qgroup *qgroup; + struct ulist *tmp; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + int sign = 0; + int ret = 0; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + + spin_lock(&fs_info->qgroup_lock); + if (!fs_info->quota_root) + goto out; + qgroup = find_qgroup_rb(fs_info, oper->ref_root); + if (!qgroup) + goto out; + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + sign = 1; + break; + case BTRFS_QGROUP_OPER_SUB_EXCL: + sign = -1; + break; + default: + ASSERT(0); + } + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); + qgroup->excl += sign * oper->num_bytes; + qgroup->excl_cmpr += sign * oper->num_bytes; + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = u64_to_ptr(unode->aux); + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + qgroup->excl += sign * oper->num_bytes; + if (sign < 0) + WARN_ON(qgroup->excl < oper->num_bytes); + qgroup->excl_cmpr += sign * oper->num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(tmp); + return ret; +} + +/* + * Walk all of the roots that pointed to our bytenr and adjust their refcnts as + * properly. + */ +static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, + u64 root_to_skip, struct ulist *tmp, + struct ulist *roots, struct ulist *qgroups, + u64 seq, int *old_roots, int rescan) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -1213,261 +1403,551 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { + /* We don't count our current root here */ + if (unode->val == root_to_skip) + continue; qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; + /* + * We could have a pending removal of this same ref so we may + * not have actually found our ref root when doing + * btrfs_find_all_roots, so we need to keep track of how many + * old roots we find in case we removed ours and added a + * different one at the same time. I don't think this could + * happen in practice but that sort of thinking leads to pain + * and suffering and to the dark side. + */ + (*old_roots)++; ulist_reinit(tmp); - /* XXX id not needed */ - ret = ulist_add(tmp, qg->qgroupid, - (u64)(uintptr_t)qg, GFP_ATOMIC); + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC); if (ret < 0) return ret; ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->refcnt < seq) - qg->refcnt = seq + 1; + qg = u64_to_ptr(tmp_unode->aux); + /* + * We use this sequence number to keep from having to + * run the whole list and 0 out the refcnt every time. + * We basically use sequnce as the known 0 count and + * then add 1 everytime we see a qgroup. This is how we + * get how many of the roots actually point up to the + * upper level qgroups in order to determine exclusive + * counts. + * + * For rescan we want to set old_refcnt to seq so our + * exclusive calculations end up correct. + */ + if (rescan) + qg->old_refcnt = seq; + else if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; else - ++qg->refcnt; + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), + GFP_ATOMIC); + if (ret < 0) + return ret; ret = ulist_add(tmp, glist->group->qgroupid, - (u64)(uintptr_t)glist->group, + ptr_to_u64(glist->group), GFP_ATOMIC); if (ret < 0) return ret; } } } + return 0; +} + +/* + * We need to walk forward in our operation tree and account for any roots that + * were deleted after we made this operation. + */ +static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct ulist *tmp, + struct ulist *qgroups, u64 seq, + int *old_roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + struct btrfs_qgroup_operation *tmp_oper; + struct rb_node *n; + int ret; + ulist_reinit(tmp); + + /* + * We only walk forward in the tree since we're only interested in + * removals that happened _after_ our operation. + */ + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + return 0; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + while (tmp_oper->bytenr == oper->bytenr) { + /* + * If it's not a removal we don't care, additions work out + * properly with our refcnt tracking. + */ + if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && + tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) + goto next; + qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); + if (!qg) + goto next; + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret) { + if (ret < 0) + return ret; + /* + * We only want to increase old_roots if this qgroup is + * not already in the list of qgroups. If it is already + * there then that means it must have been re-added or + * the delete will be discarded because we had an + * existing ref that we haven't looked up yet. In this + * case we don't want to increase old_roots. So if ret + * == 1 then we know that this is the first time we've + * seen this qgroup and we can bump the old_roots. + */ + (*old_roots)++; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + } +next: + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&tmp_oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + break; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + } + + /* Ok now process the qgroups we found */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + } + } return 0; } -static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq, int sgn, u64 num_bytes, - struct btrfs_qgroup *qgroup) +/* Add refcnt for the newly added reference. */ +static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct btrfs_qgroup *qgroup, + struct ulist *tmp, struct ulist *qgroups, + u64 seq) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; int ret; ulist_reinit(tmp); - ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); + ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); if (ret < 0) return ret; - ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(tmp, &uiter))) { - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; - if (qg->refcnt < seq) { - /* not visited by step 1 */ - qg->rfer += sgn * num_bytes; - qg->rfer_cmpr += sgn * num_bytes; - if (roots->nnodes == 0) { - qg->excl += sgn * num_bytes; - qg->excl_cmpr += sgn * num_bytes; - } - qgroup_dirty(fs_info, qg); - } - WARN_ON(qg->tag >= seq); - qg->tag = seq; + struct btrfs_qgroup_list *glist; + qg = u64_to_ptr(unode->aux); + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + } else { + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + } list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); if (ret < 0) return ret; } } - return 0; } -static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq, int sgn, u64 num_bytes) +/* + * This adjusts the counters for all referenced qgroups if need be. + */ +static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, + u64 root_to_skip, u64 num_bytes, + struct ulist *qgroups, u64 seq, + int old_roots, int new_roots, int rescan) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; - int ret; + u64 cur_new_count, cur_old_count; ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; + while ((unode = ulist_next(qgroups, &uiter))) { + bool dirty = false; - ulist_reinit(tmp); - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); - if (ret < 0) - return ret; + qg = u64_to_ptr(unode->aux); + /* + * Wasn't referenced before but is now, add to the reference + * counters. + */ + if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + qg->rfer += num_bytes; + qg->rfer_cmpr += num_bytes; + dirty = true; + } - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { - struct btrfs_qgroup_list *glist; + /* + * Was referenced before but isn't now, subtract from the + * reference counters. + */ + if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + qg->rfer -= num_bytes; + qg->rfer_cmpr -= num_bytes; + dirty = true; + } - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->tag == seq) - continue; + if (qg->old_refcnt < seq) + cur_old_count = 0; + else + cur_old_count = qg->old_refcnt - seq; + if (qg->new_refcnt < seq) + cur_new_count = 0; + else + cur_new_count = qg->new_refcnt - seq; - if (qg->refcnt - seq == roots->nnodes) { - qg->excl -= sgn * num_bytes; - qg->excl_cmpr -= sgn * num_bytes; - qgroup_dirty(fs_info, qg); - } + /* + * If our refcount was the same as the roots previously but our + * new count isn't the same as the number of roots now then we + * went from having a exclusive reference on this range to not. + */ + if (old_roots && cur_old_count == old_roots && + (cur_new_count != new_roots || new_roots == 0)) { + WARN_ON(cur_new_count != new_roots && new_roots == 0); + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - if (ret < 0) - return ret; - } + /* + * If we didn't reference all the roots before but now we do we + * have an exclusive reference to this range. + */ + if ((!old_roots || (old_roots && cur_old_count != old_roots)) + && cur_new_count == new_roots) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; } - } + if (dirty) + qgroup_dirty(fs_info, qg); + } return 0; } /* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. + * If we removed a data extent and there were other references for that bytenr + * then we need to lookup all referenced roots to make sure we still don't + * reference this bytenr. If we do then we can just discard this operation. */ -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) +static int check_existing_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) { - struct btrfs_key ins; - struct btrfs_root *quota_root; - u64 ref_root; - struct btrfs_qgroup *qgroup; struct ulist *roots = NULL; - u64 seq; + struct ulist_node *unode; + struct ulist_iterator uiter; int ret = 0; - int sgn; - if (!fs_info->quota_enabled) - return 0; - - BUG_ON(!fs_info->quota_root); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + oper->elem.seq, &roots); + if (ret < 0) + return ret; + ret = 0; - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY) { - struct btrfs_delayed_tree_ref *ref; - ref = btrfs_delayed_node_to_tree_ref(node); - ref_root = ref->root; - } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || - node->type == BTRFS_SHARED_DATA_REF_KEY) { - struct btrfs_delayed_data_ref *ref; - ref = btrfs_delayed_node_to_data_ref(node); - ref_root = ref->root; - } else { - BUG(); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + if (unode->val == oper->ref_root) { + ret = 1; + break; + } } + ulist_free(roots); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); - if (!is_fstree(ref_root)) { - /* - * non-fs-trees are not being accounted - */ - return 0; - } + return ret; +} - switch (node->action) { - case BTRFS_ADD_DELAYED_REF: - case BTRFS_ADD_DELAYED_EXTENT: - sgn = 1; - seq = btrfs_tree_mod_seq_prev(node->seq); - break; - case BTRFS_DROP_DELAYED_REF: - sgn = -1; - seq = node->seq; - break; - case BTRFS_UPDATE_DELAYED_HEAD: - return 0; - default: - BUG(); - } +/* + * If we share a reference across multiple roots then we may need to adjust + * various qgroups referenced and exclusive counters. The basic premise is this + * + * 1) We have seq to represent a 0 count. Instead of looping through all of the + * qgroups and resetting their refcount to 0 we just constantly bump this + * sequence number to act as the base reference count. This means that if + * anybody is equal to or below this sequence they were never referenced. We + * jack this sequence up by the number of roots we found each time in order to + * make sure we don't have any overlap. + * + * 2) We first search all the roots that reference the area _except_ the root + * we're acting on currently. This makes up the old_refcnt of all the qgroups + * before. + * + * 3) We walk all of the qgroups referenced by the root we are currently acting + * on, and will either adjust old_refcnt in the case of a removal or the + * new_refcnt in the case of an addition. + * + * 4) Finally we walk all the qgroups that are referenced by this range + * including the root we are acting on currently. We will adjust the counters + * based on the number of roots we had and will have after this operation. + * + * Take this example as an illustration + * + * [qgroup 1/0] + * / | \ + * [qg 0/0] [qg 0/1] [qg 0/2] + * \ | / + * [ extent ] + * + * Say we are adding a reference that is covered by qg 0/0. The first step + * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with + * old_roots being 2. Because it is adding new_roots will be 1. We then go + * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's + * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we + * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a + * reference and thus must add the size to the referenced bytes. Everything + * else is the same so nothing else changes. + */ +static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist *qgroups, *tmp; + struct btrfs_qgroup *qgroup; + struct seq_list elem = {}; + u64 seq; + int old_roots = 0; + int new_roots = 0; + int ret = 0; - mutex_lock(&fs_info->qgroup_rescan_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { - mutex_unlock(&fs_info->qgroup_rescan_lock); + if (oper->elem.seq) { + ret = check_existing_refs(trans, fs_info, oper); + if (ret < 0) + return ret; + if (ret) return 0; - } } - mutex_unlock(&fs_info->qgroup_rescan_lock); - - /* - * the delayed ref sequence number we pass depends on the direction of - * the operation. for add operations, we pass - * tree_mod_log_prev_seq(node->seq) to skip - * the delayed ref's current sequence number, because we need the state - * of the tree before the add operation. for delete operations, we pass - * (node->seq) to include the delayed ref's current sequence number, - * because we need the state of the tree after the delete operation. - */ - ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots); - if (ret < 0) - return ret; - spin_lock(&fs_info->qgroup_lock); + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + return -ENOMEM; - quota_root = fs_info->quota_root; - if (!quota_root) - goto unlock; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) { + ulist_free(qgroups); + return -ENOMEM; + } - qgroup = find_qgroup_rb(fs_info, ref_root); + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, + &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) { + ulist_free(qgroups); + ulist_free(tmp); + return ret; + } + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, oper->ref_root); if (!qgroup) - goto unlock; + goto out; + seq = fs_info->qgroup_seq; /* - * step 1: for each old ref, visit all nodes once and inc refcnt + * So roots is the list of all the roots currently pointing at the + * bytenr, including the ref we are adding if we are adding, or not if + * we are removing a ref. So we pass in the ref_root to skip that root + * in our calculations. We set old_refnct and new_refcnt cause who the + * hell knows what everything looked like before, and it doesn't matter + * except... */ - ulist_reinit(fs_info->qgroup_ulist); - seq = fs_info->qgroup_seq; - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ + ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, + seq, &old_roots, 0); + if (ret < 0) + goto out; - ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, - seq); - if (ret) - goto unlock; + /* + * Now adjust the refcounts of the qgroups that care about this + * reference, either the old_count in the case of removal or new_count + * in the case of an addition. + */ + ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, + seq); + if (ret < 0) + goto out; /* - * step 2: walk from the new root + * ...in the case of removals. If we had a removal before we got around + * to processing this operation then we need to find that guy and count + * his references as if they really existed so we don't end up screwing + * up the exclusive counts. Then whenever we go to process the delete + * everything will be grand and we can account for whatever exclusive + * changes need to be made there. We also have to pass in old_roots so + * we have an accurate count of the roots as it pertains to this + * operations view of the world. */ - ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, - seq, sgn, node->num_bytes, qgroup); - if (ret) - goto unlock; + ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, + &old_roots); + if (ret < 0) + goto out; /* - * step 3: walk again from old refs + * We are adding our root, need to adjust up the number of roots, + * otherwise old_roots is the number of roots we want. */ - ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, - seq, sgn, node->num_bytes); - if (ret) - goto unlock; + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + new_roots = old_roots + 1; + } else { + new_roots = old_roots; + old_roots++; + } + fs_info->qgroup_seq += old_roots + 1; -unlock: + + /* + * And now the magic happens, bless Arne for having a pretty elegant + * solution for this. + */ + qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, + qgroups, seq, old_roots, new_roots, 0); +out: spin_unlock(&fs_info->qgroup_lock); + ulist_free(qgroups); ulist_free(roots); + ulist_free(tmp); + return ret; +} + +/* + * btrfs_qgroup_account_ref is called for every ref that is added to or deleted + * from the fs. First, all roots referencing the extent are searched, and + * then the space is accounted accordingly to the different roots. The + * accounting algorithm works in 3 steps documented inline. + */ +static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + int ret = 0; + + if (!fs_info->quota_enabled) + return 0; + + BUG_ON(!fs_info->quota_root); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + return 0; + } + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + ASSERT(is_fstree(oper->ref_root)); + + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + case BTRFS_QGROUP_OPER_SUB_EXCL: + ret = qgroup_excl_accounting(fs_info, oper); + break; + case BTRFS_QGROUP_OPER_ADD_SHARED: + case BTRFS_QGROUP_OPER_SUB_SHARED: + ret = qgroup_shared_accounting(trans, fs_info, oper); + break; + default: + ASSERT(0); + } + return ret; +} + +/* + * Needs to be called everytime we run delayed refs, even if there is an error + * in order to cleanup outstanding operations. + */ +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_qgroup_operation *oper; + int ret = 0; + while (!list_empty(&trans->qgroup_ref_list)) { + oper = list_first_entry(&trans->qgroup_ref_list, + struct btrfs_qgroup_operation, list); + list_del_init(&oper->list); + if (!ret || !trans->aborted) + ret = btrfs_qgroup_account(trans, fs_info, oper); + spin_lock(&fs_info->qgroup_op_lock); + rb_erase(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); + kfree(oper); + } return ret; } @@ -1516,8 +1996,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); } ret = 0; } @@ -1636,8 +2116,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock; - dstgroup->rfer = srcgroup->rfer - level_size; - dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; + + /* + * We call inherit after we clone the root in order to make sure + * our counts don't go crazy, so at this point the only + * difference between the two roots should be the root node. + */ + dstgroup->rfer = srcgroup->rfer; + dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; + dstgroup->excl = level_size; + dstgroup->excl_cmpr = level_size; srcgroup->excl = level_size; srcgroup->excl_cmpr = level_size; qgroup_dirty(fs_info, dstgroup); @@ -1741,7 +2229,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + (s64)qg->rfer + num_bytes > @@ -1773,7 +2261,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); qg->reserved += num_bytes; } @@ -1819,7 +2307,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); qg->reserved -= num_bytes; @@ -1840,7 +2328,9 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) return; - pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n", + btrfs_err(trans->root->fs_info, + "qgroups not uptodate in trans handle %p: list is%s empty, " + "seq is %#x.%x", trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", (u32)(trans->delayed_ref_elem.seq >> 32), (u32)trans->delayed_ref_elem.seq); @@ -1853,15 +2343,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, struct ulist *tmp, - struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans, struct ulist *qgroups, + struct ulist *tmp, struct extent_buffer *scratch_leaf) { struct btrfs_key found; struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; struct seq_list tree_mod_seq_elem = {}; + u64 num_bytes; u64 seq; + int new_roots; int slot; int ret; @@ -1903,78 +2393,42 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { btrfs_item_key_to_cpu(scratch_leaf, &found, slot); - if (found.type != BTRFS_EXTENT_ITEM_KEY) + if (found.type != BTRFS_EXTENT_ITEM_KEY && + found.type != BTRFS_METADATA_ITEM_KEY) continue; - ret = btrfs_find_all_roots(trans, fs_info, found.objectid, - tree_mod_seq_elem.seq, &roots); + if (found.type == BTRFS_METADATA_ITEM_KEY) + num_bytes = fs_info->extent_root->leafsize; + else + num_bytes = found.offset; + + ulist_reinit(qgroups); + ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, + &roots); if (ret < 0) goto out; spin_lock(&fs_info->qgroup_lock); seq = fs_info->qgroup_seq; fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); - if (ret) { + new_roots = 0; + ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, + seq, &new_roots, 1); + if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); ulist_free(roots); goto out; } - /* - * step2 of btrfs_qgroup_account_ref works from a single root, - * we're doing all at once here. - */ - ulist_reinit(tmp); - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - struct btrfs_qgroup *qg; - - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; - - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, - GFP_ATOMIC); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - } - - /* this loop is similar to step 2 of btrfs_qgroup_account_ref */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; - - qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; - qg->rfer += found.offset; - qg->rfer_cmpr += found.offset; - WARN_ON(qg->tag >= seq); - if (qg->refcnt - seq == roots->nnodes) { - qg->excl += found.offset; - qg->excl_cmpr += found.offset; - } - qgroup_dirty(fs_info, qg); - - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - } + ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, + seq, 0, new_roots, 1); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; } - spin_unlock(&fs_info->qgroup_lock); ulist_free(roots); - ret = 0; } - out: btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -1987,13 +2441,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct ulist *tmp = NULL; + struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; path = btrfs_alloc_path(); if (!path) goto out; + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + goto out; tmp = ulist_alloc(GFP_NOFS); if (!tmp) goto out; @@ -2012,7 +2469,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = -EINTR; } else { err = qgroup_rescan_leaf(fs_info, path, trans, - tmp, scratch_leaf); + qgroups, tmp, scratch_leaf); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2022,6 +2479,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: kfree(scratch_leaf); + ulist_free(qgroups); ulist_free(tmp); btrfs_free_path(path); @@ -2037,10 +2495,10 @@ out: mutex_unlock(&fs_info->qgroup_rescan_lock); if (err >= 0) { - pr_info("btrfs: qgroup scan completed%s\n", + btrfs_info(fs_info, "qgroup scan completed%s", err == 2 ? " (inconsistency flag cleared)" : ""); } else { - pr_err("btrfs: qgroup scan failed with %d\n", err); + btrfs_err(fs_info, "qgroup scan failed with %d", err); } complete_all(&fs_info->qgroup_rescan_completion); @@ -2092,11 +2550,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_work, 0, sizeof(fs_info->qgroup_rescan_work)); - fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; + btrfs_init_work(&fs_info->qgroup_rescan_work, + btrfs_qgroup_rescan_worker, NULL, NULL); if (ret) { err: - pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret); + btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret); return ret; } @@ -2155,8 +2614,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) qgroup_rescan_zero_tracking(fs_info); - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); return 0; } @@ -2187,6 +2646,6 @@ void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h new file mode 100644 index 00000000000..5952ff1fbd7 --- /dev/null +++ b/fs/btrfs/qgroup.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_QGROUP__ +#define __BTRFS_QGROUP__ + +/* + * A description of the operations, all of these operations only happen when we + * are adding the 1st reference for that subvolume in the case of adding space + * or on the last reference delete in the case of subtraction. The only + * exception is the last one, which is added for confusion. + * + * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only + * one pointing at the bytes we are adding. This is called on the first + * allocation. + * + * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be + * shared between subvols. This is called on the creation of a ref that already + * has refs from a different subvolume, so basically reflink. + * + * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only + * one referencing the range. + * + * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with + * refs with other subvolumes. + */ +enum btrfs_qgroup_operation_type { + BTRFS_QGROUP_OPER_ADD_EXCL, + BTRFS_QGROUP_OPER_ADD_SHARED, + BTRFS_QGROUP_OPER_SUB_EXCL, + BTRFS_QGROUP_OPER_SUB_SHARED, +}; + +struct btrfs_qgroup_operation { + u64 ref_root; + u64 bytenr; + u64 num_bytes; + u64 seq; + enum btrfs_qgroup_operation_type type; + struct seq_list elem; + struct rb_node n; + struct list_head list; +}; + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + char *name); +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit); +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); +struct btrfs_delayed_extent_op; +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, + int mod_seq); +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit); +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); +#endif + +#endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d0ecfbd9cc9..4a88f073fdd 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -33,7 +33,6 @@ #include <linux/raid/xor.h> #include <linux/vmalloc.h> #include <asm/div64.h> -#include "compat.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -1033,8 +1032,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* see if we can add this page onto our existing bio */ if (last) { - last_end = (u64)last->bi_sector << 9; - last_end += last->bi_size; + last_end = (u64)last->bi_iter.bi_sector << 9; + last_end += last->bi_iter.bi_size; /* * we can't merge these if they are from different @@ -1054,9 +1053,9 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, if (!bio) return -ENOMEM; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; bio->bi_bdev = stripe->dev->bdev; - bio->bi_sector = disk_start >> 9; + bio->bi_iter.bi_sector = disk_start >> 9; set_bit(BIO_UPTODATE, &bio->bi_flags); bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); @@ -1112,7 +1111,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_lock_irq(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) { - start = (u64)bio->bi_sector << 9; + start = (u64)bio->bi_iter.bi_sector << 9; stripe_offset = start - rbio->raid_map[0]; page_index = stripe_offset >> PAGE_CACHE_SHIFT; @@ -1273,7 +1272,7 @@ cleanup: static int find_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 physical = bio->bi_sector; + u64 physical = bio->bi_iter.bi_sector; u64 stripe_start; int i; struct btrfs_bio_stripe *stripe; @@ -1299,7 +1298,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 logical = bio->bi_sector; + u64 logical = bio->bi_iter.bi_sector; u64 stripe_start; int i; @@ -1417,20 +1416,18 @@ cleanup: static void async_rmw_stripe(struct btrfs_raid_bio *rbio) { - rbio->work.flags = 0; - rbio->work.func = rmw_work; + btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); - btrfs_queue_worker(&rbio->fs_info->rmw_workers, - &rbio->work); + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); } static void async_read_rebuild(struct btrfs_raid_bio *rbio) { - rbio->work.flags = 0; - rbio->work.func = read_rebuild_work; + btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); - btrfs_queue_worker(&rbio->fs_info->rmw_workers, - &rbio->work); + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); } /* @@ -1603,8 +1600,8 @@ static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) plug_list); struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, plug_list); - u64 a_sector = ra->bio_list.head->bi_sector; - u64 b_sector = rb->bio_list.head->bi_sector; + u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; + u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; if (a_sector < b_sector) return -1; @@ -1668,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - plug->work.flags = 0; - plug->work.func = unplug_work; - btrfs_queue_worker(&plug->info->rmw_workers, - &plug->work); + btrfs_init_work(&plug->work, unplug_work, NULL, NULL); + btrfs_queue_work(plug->info->rmw_workers, + &plug->work); return; } run_plug(plug); @@ -1692,7 +1688,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, if (IS_ERR(rbio)) return PTR_ERR(rbio); bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_size; + rbio->bio_list_bytes = bio->bi_iter.bi_size; /* * don't plug on full rbios, just get them out the door @@ -1960,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * pages are going to be uptodate. */ for (stripe = 0; stripe < bbio->num_stripes; stripe++) { - if (rbio->faila == stripe || - rbio->failb == stripe) + if (rbio->faila == stripe || rbio->failb == stripe) { + atomic_inc(&rbio->bbio->error); continue; + } for (pagenr = 0; pagenr < nr_pages; pagenr++) { struct page *p; @@ -2045,7 +2042,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->read_rebuild = 1; bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_size; + rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 1031b69252c..09230cf3a24 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -189,8 +189,8 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, */ #ifdef DEBUG if (rec->generation != generation) { - printk(KERN_DEBUG "generation mismatch for " - "(%llu,%d,%llu) %llu != %llu\n", + btrfs_debug(root->fs_info, + "generation mismatch for (%llu,%d,%llu) %llu != %llu", key.objectid, key.type, key.offset, rec->generation, generation); } @@ -365,8 +365,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, goto error; if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { - printk(KERN_ERR "btrfs readahead: more than %d copies not " - "supported", BTRFS_MAX_MIRRORS); + btrfs_err(root->fs_info, + "readahead: more than %d copies not supported", + BTRFS_MAX_MIRRORS); goto error; } @@ -427,8 +428,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, continue; } if (!dev->bdev) { - /* cannot read ahead on missing device */ - continue; + /* + * cannot read ahead on missing device, but for RAID5/6, + * REQ_GET_READ_MIRRORS return 1. So don't skip missing + * device for such case. + */ + if (nzones > 1) + continue; } if (dev_replace_is_ongoing && dev == fs_info->dev_replace.tgtdev) { @@ -792,10 +798,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) /* FIXME we cannot handle this properly right now */ BUG(); } - rmw->work.func = reada_start_machine_worker; + btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); rmw->fs_info = fs_info; - btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); + btrfs_queue_work(fs_info->readahead_workers, &rmw->work); } #ifdef DEBUG diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a5a26320503..65245a07275 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -94,6 +94,7 @@ struct backref_edge { #define LOWER 0 #define UPPER 1 +#define RELOCATION_RESERVED_NODES 256 struct backref_cache { /* red black tree of all backref nodes in the cache */ @@ -176,6 +177,8 @@ struct reloc_control { u64 merging_rsv_size; /* size of relocated tree nodes */ u64 nodes_relocated; + /* reserved size for block group relocation*/ + u64 reserved_bytes; u64 search_start; u64 extents_found; @@ -184,7 +187,6 @@ struct reloc_control { unsigned int create_reloc_tree:1; unsigned int merge_reloc_tree:1; unsigned int found_file_extent:1; - unsigned int commit_transaction:1; }; /* stages of data relocation */ @@ -335,7 +337,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) if (bnode->root) fs_info = bnode->root->fs_info; btrfs_panic(fs_info, errno, "Inconsistency in backref cache " - "found at offset %llu\n", bytenr); + "found at offset %llu", bytenr); } /* @@ -526,7 +528,7 @@ static int should_ignore_root(struct btrfs_root *root) { struct btrfs_root *reloc_root; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; reloc_root = root->reloc_root; @@ -571,7 +573,9 @@ static int is_cowonly_root(u64 root_objectid) root_objectid == BTRFS_CHUNK_TREE_OBJECTID || root_objectid == BTRFS_DEV_TREE_OBJECTID || root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID) + root_objectid == BTRFS_CSUM_TREE_OBJECTID || + root_objectid == BTRFS_UUID_TREE_OBJECTID || + root_objectid == BTRFS_QUOTA_TREE_OBJECTID) return 1; return 0; } @@ -588,7 +592,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, else key.offset = (u64)-1; - return btrfs_read_fs_root_no_name(fs_info, &key); + return btrfs_get_fs_root(fs_info, &key, false); } #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 @@ -606,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc, root = read_fs_root(rc->extent_root->fs_info, root_objectid); BUG_ON(IS_ERR(root)); - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && generation != btrfs_root_generation(&root->root_item)) return NULL; @@ -883,7 +887,7 @@ again: goto out; } - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) cur->cowonly = 1; if (btrfs_root_level(&root->root_item) == cur->level) { @@ -950,7 +954,8 @@ again: upper->bytenr = eb->start; upper->owner = btrfs_header_owner(eb); upper->level = lower->level + 1; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) upper->cowonly = 1; /* @@ -1254,7 +1259,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) if (rb_node) { btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " "for start=%llu while inserting into relocation " - "tree\n", node->bytenr); + "tree", node->bytenr); kfree(node); return -EEXIST; } @@ -1264,10 +1269,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) } /* - * helper to update/delete the 'address of tree root -> reloc tree' + * helper to delete the 'address of tree root -> reloc tree' * mapping */ -static int __update_reloc_root(struct btrfs_root *root, int del) +static void __del_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node *node = NULL; @@ -1275,7 +1280,7 @@ static int __update_reloc_root(struct btrfs_root *root, int del) spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->commit_root->start); + root->node->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1283,23 +1288,45 @@ static int __update_reloc_root(struct btrfs_root *root, int del) spin_unlock(&rc->reloc_root_tree.lock); if (!node) - return 0; + return; BUG_ON((struct btrfs_root *)node->data != root); - if (!del) { - spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = root->node->start; - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, node->bytenr); - } else { - spin_lock(&root->fs_info->trans_lock); - list_del_init(&root->root_list); - spin_unlock(&root->fs_info->trans_lock); - kfree(node); + spin_lock(&root->fs_info->trans_lock); + list_del_init(&root->root_list); + spin_unlock(&root->fs_info->trans_lock); + kfree(node); +} + +/* + * helper to update the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->node->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); } + spin_unlock(&rc->reloc_root_tree.lock); + + if (!node) + return 0; + BUG_ON((struct btrfs_root *)node->data != root); + + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = new_bytenr; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); return 0; } @@ -1383,6 +1410,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, { struct btrfs_root *reloc_root; struct reloc_control *rc = root->fs_info->reloc_ctl; + struct btrfs_block_rsv *rsv; int clear_rsv = 0; int ret; @@ -1396,13 +1424,14 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) return 0; - if (!trans->block_rsv) { + if (!trans->reloc_reserved) { + rsv = trans->block_rsv; trans->block_rsv = rc->block_rsv; clear_rsv = 1; } reloc_root = create_reloc_root(trans, root, root->root_key.objectid); if (clear_rsv) - trans->block_rsv = NULL; + trans->block_rsv = rsv; ret = __add_reloc_root(reloc_root); BUG_ON(ret < 0); @@ -1418,7 +1447,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, { struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; - int del = 0; int ret; if (!root->reloc_root) @@ -1430,11 +1458,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, if (root->fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { root->reloc_root = NULL; - del = 1; + __del_reloc_root(reloc_root); } - __update_reloc_root(reloc_root, del); - if (reloc_root->commit_root != reloc_root->node) { btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); @@ -1775,8 +1801,7 @@ again: new_ptr_gen = 0; } - if (new_bytenr > 0 && new_bytenr == old_bytenr) { - WARN_ON(1); + if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) { ret = level; break; } @@ -2058,7 +2083,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, LIST_HEAD(inode_list); struct btrfs_key key; struct btrfs_key next_key; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; struct btrfs_path *path; @@ -2107,18 +2132,19 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, memset(&next_key, 0, sizeof(next_key)); while (1) { - trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); - trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, BTRFS_RESERVE_FLUSH_ALL); if (ret) { - BUG_ON(ret != -EAGAIN); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - continue; + err = ret; + goto out; + } + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + goto out; } + trans->block_rsv = rc->block_rsv; replaced = 0; max_level = level; @@ -2164,6 +2190,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, root_item->drop_level = level; btrfs_end_transaction_throttle(trans, root); + trans = NULL; btrfs_btree_balance_dirty(root); @@ -2192,7 +2219,8 @@ out: btrfs_update_reloc_root(trans, root); } - btrfs_end_transaction_throttle(trans, root); + if (trans) + btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root); @@ -2283,17 +2311,13 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); - __update_reloc_root(reloc_root, 1); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); + __del_reloc_root(reloc_root); } } static noinline_for_stack int merge_reloc_roots(struct reloc_control *rc) { - struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_root *reloc_root; u64 last_snap; @@ -2328,10 +2352,9 @@ again: ret = merge_reloc_root(rc, root); if (ret) { - __update_reloc_root(reloc_root, 1); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); goto out; } } else { @@ -2352,26 +2375,6 @@ again: list_add_tail(&reloc_root->root_list, &reloc_roots); goto out; - } else if (!ret) { - /* - * recover the last snapshot tranid to avoid - * the space balance break NOCOW. - */ - root = read_fs_root(rc->extent_root->fs_info, - objectid); - if (IS_ERR(root)) - continue; - - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - - /* Check if the fs/file tree was snapshoted or not. */ - if (btrfs_root_last_snapshot(&root->root_item) == - otransid - 1) - btrfs_set_root_last_snapshot(&root->root_item, - last_snap); - - btrfs_end_transaction(trans, root); } } @@ -2384,6 +2387,13 @@ out: btrfs_std_error(root->fs_info, ret); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); + + /* new reloc root may be added */ + mutex_lock(&root->fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->reloc_mutex); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); } BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); @@ -2420,7 +2430,7 @@ static noinline_for_stack struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct backref_node *node, - struct backref_edge *edges[], int *nr) + struct backref_edge *edges[]) { struct backref_node *next; struct btrfs_root *root; @@ -2432,7 +2442,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, next = walk_up_backref(next, edges, &index); root = next->root; BUG_ON(!root); - BUG_ON(!root->ref_cows); + BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { record_reloc_root_in_trans(trans, root); @@ -2462,7 +2472,6 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!root) return NULL; - *nr = index; next = node; /* setup backref node path for btrfs_reloc_cow_block */ while (1) { @@ -2498,7 +2507,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, BUG_ON(!root); /* no other choice for non-references counted tree */ - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return root; if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) @@ -2558,28 +2567,36 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, struct btrfs_root *root = rc->extent_root; u64 num_bytes; int ret; + u64 tmp; num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, - BTRFS_RESERVE_FLUSH_ALL); + rc->reserved_bytes += num_bytes; + ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { - if (ret == -EAGAIN) - rc->commit_transaction = 1; + if (ret == -EAGAIN) { + tmp = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) + tmp <<= 1; + /* + * only one thread can access block_rsv at this point, + * so we don't need hold lock to protect block_rsv. + * we expand more reservation size here to allow enough + * space for relocation and we will return eailer in + * enospc case. + */ + rc->block_rsv->size = tmp + rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + } return ret; } return 0; } -static void release_metadata_space(struct reloc_control *rc, - struct backref_node *node) -{ - u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2; - btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); -} - /* * relocate a block tree, and then update pointers in upper level * blocks that reference the block to point to the new location. @@ -2601,7 +2618,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, u32 blocksize; u64 bytenr; u64 generation; - int nr; int slot; int ret; int err = 0; @@ -2614,7 +2630,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, cond_resched(); upper = edge->node[UPPER]; - root = select_reloc_root(trans, rc, upper, edges, &nr); + root = select_reloc_root(trans, rc, upper, edges); BUG_ON(!root); if (upper->eb && !upper->locked) { @@ -2866,7 +2882,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_root *root; - int release = 0; int ret = 0; if (!node) @@ -2879,15 +2894,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, goto out; } - if (!root || root->ref_cows) { + if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = reserve_metadata_space(trans, rc, node); if (ret) goto out; - release = 1; } if (root) { - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { BUG_ON(node->new_bytenr); BUG_ON(!list_empty(&node->list)); btrfs_record_root_in_trans(trans, root); @@ -2908,11 +2922,8 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = do_relocation(trans, rc, node, key, path, 1); } out: - if (ret || node->level == 0 || node->cowonly) { - if (release) - release_metadata_space(rc, node); + if (ret || node->level == 0 || node->cowonly) remove_backref_node(&rc->backref_cache, node); - } return ret; } @@ -3258,7 +3269,7 @@ static int add_tree_block(struct reloc_control *rc, struct rb_node *rb_node; u32 item_size; int level = -1; - int generation; + u64 generation; eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); @@ -3407,7 +3418,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, struct inode *inode, u64 ino) { struct btrfs_key key; - struct btrfs_path *path; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int ret = 0; @@ -3432,22 +3442,14 @@ truncate: if (ret) goto out; - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { - btrfs_free_path(path); ret = PTR_ERR(trans); goto out; } - ret = btrfs_truncate_free_space_cache(root, trans, path, inode); + ret = btrfs_truncate_free_space_cache(root, trans, inode); - btrfs_free_path(path); btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); out: @@ -3549,10 +3551,8 @@ static int find_data_references(struct reloc_control *rc, err = ret; goto out; } - if (ret > 0) { - WARN_ON(1); + if (WARN_ON(ret > 0)) goto out; - } leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -3572,11 +3572,9 @@ static int find_data_references(struct reloc_control *rc, } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ref_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) { - WARN_ON(1); + if (WARN_ON(key.objectid != ref_objectid || + key.type != BTRFS_EXTENT_DATA_KEY)) break; - } fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -3848,29 +3846,20 @@ static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { struct btrfs_trans_handle *trans; - int ret; rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, BTRFS_BLOCK_RSV_TEMP); if (!rc->block_rsv) return -ENOMEM; - /* - * reserve some space for creating reloc trees. - * btrfs_init_reloc_root will use them when there - * is no reservation in transaction handle. - */ - ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256, - BTRFS_RESERVE_FLUSH_ALL); - if (ret) - return ret; - memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; + rc->reserved_bytes = 0; + rc->block_rsv->size = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; rc->create_reloc_tree = 1; set_reloc_control(rc); @@ -3914,6 +3903,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } while (1) { + rc->reserved_bytes = 0; + ret = btrfs_block_rsv_refill(rc->extent_root, + rc->block_rsv, rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + err = ret; + break; + } progress++; trans = btrfs_start_transaction(rc->extent_root, 0); if (IS_ERR(trans)) { @@ -3992,6 +3989,12 @@ restart: if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { + /* + * if we fail to relocate tree blocks, force to update + * backref cache when committing transaction. + */ + rc->backref_cache.last_trans = trans->transid - 1; + if (ret != -EAGAIN) { err = ret; break; @@ -4001,24 +4004,8 @@ restart: } } - ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); - if (ret < 0) { - if (ret != -ENOSPC) { - err = ret; - WARN_ON(1); - break; - } - rc->commit_transaction = 1; - } - - if (rc->commit_transaction) { - rc->commit_transaction = 0; - ret = btrfs_commit_transaction(trans, rc->extent_root); - BUG_ON(ret); - } else { - btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root); - } + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root); trans = NULL; if (rc->stage == MOVE_DATA_EXTENTS && @@ -4238,15 +4225,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) goto out; } - printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", + btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_all_delalloc_inodes(fs_info, 0); + ret = btrfs_start_delalloc_roots(fs_info, 0, -1); if (ret < 0) { err = ret; goto out; } - btrfs_wait_all_ordered_extents(fs_info); + btrfs_wait_ordered_roots(fs_info, -1); while (1) { mutex_lock(&fs_info->cleaner_mutex); @@ -4260,22 +4247,22 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) if (rc->extents_found == 0) break; - printk(KERN_INFO "btrfs: found %llu extents\n", + btrfs_info(extent_root->fs_info, "found %llu extents", rc->extents_found); if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); + ret = btrfs_wait_ordered_range(rc->data_inode, 0, + (u64)-1); + if (ret) { + err = ret; + goto out; + } invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; } } - filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, - rc->block_group->key.objectid, - rc->block_group->key.objectid + - rc->block_group->key.offset - 1); - WARN_ON(rc->block_group->pinned > 0); WARN_ON(rc->block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); @@ -4481,6 +4468,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) struct btrfs_root *root = BTRFS_I(inode)->root; int ret; u64 disk_bytenr; + u64 new_bytenr; LIST_HEAD(list); ordered = btrfs_lookup_ordered_extent(inode, file_pos); @@ -4492,13 +4480,24 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) if (ret) goto out; - disk_bytenr = ordered->start; while (!list_empty(&list)) { sums = list_entry(list.next, struct btrfs_ordered_sum, list); list_del_init(&sums->list); - sums->bytenr = disk_bytenr; - disk_bytenr += sums->len; + /* + * We need to offset the new_bytenr based on where the csum is. + * We need to do this because we will read in entire prealloc + * extents but we may have written to say the middle of the + * prealloc extent, so we need to make sure the csum goes with + * the right disk offset. + * + * We can do this because the data reloc inode refers strictly + * to the on disk bytes, so we don't have to worry about + * disk_len vs real len like with real inodes since it's all + * disk length. + */ + new_bytenr = ordered->start + (sums->bytenr - disk_bytenr); + sums->bytenr = new_bytenr; btrfs_add_ordered_sum(inode, ordered, sums); } @@ -4524,6 +4523,11 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, BUG_ON(rc->stage == UPDATE_DATA_PTRS && root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (buf == root->node) + __update_reloc_root(root, cow->start); + } + level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 0b1f4ef8db9..360a728a639 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/err.h> #include <linux/uuid.h> #include "ctree.h" #include "transaction.h" @@ -44,7 +45,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, if (!need_reset && btrfs_root_generation(item) != btrfs_root_generation_v2(item)) { if (btrfs_root_generation_v2(item) != 0) { - printk(KERN_WARNING "btrfs: mismatching " + printk(KERN_WARNING "BTRFS: mismatching " "generation and generation_v2 " "found in root item. This root " "was probably mounted with an " @@ -154,7 +155,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); - printk(KERN_CRIT "unable to update root key %llu %u %llu\n", + btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu", key->objectid, key->type, key->offset); BUG_ON(1); } @@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) key.offset++; root = btrfs_read_fs_root(tree_root, &root_key); - err = PTR_RET(root); + err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { break; } else if (err == -ENOENT) { @@ -299,18 +300,13 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) continue; } - if (btrfs_root_refs(&root->root_item) == 0) { - btrfs_add_dead_root(root); - continue; - } - err = btrfs_init_fs_root(root); if (err) { btrfs_free_fs_root(root); break; } - root->orphan_item_inserted = 1; + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); if (err) { @@ -318,6 +314,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) btrfs_free_fs_root(root); break; } + + if (btrfs_root_refs(&root->root_item) == 0) + btrfs_add_dead_root(root); } btrfs_free_path(path); @@ -402,21 +401,6 @@ out: return err; } -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id) -{ - struct btrfs_key key; - int ret; - - key.objectid = root_id; - key.type = BTRFS_ROOT_REF_KEY; - key.offset = ref_id; - - ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - return ret; -} - /* * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY * or BTRFS_ROOT_BACKREF_KEY. diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a18e0e23f6a..b6d198f5181 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -208,7 +208,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, int is_metadata, int have_csum, const u8 *csum, u64 generation, u16 csum_size); -static void scrub_complete_bio_end_io(struct bio *bio, int err); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int force_write); @@ -257,6 +256,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace); static void copy_nocow_pages_worker(struct btrfs_work *work); +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -270,6 +271,29 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx) wake_up(&sctx->list_wait); } +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } +} + +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + + wake_up(&fs_info->scrub_pause_wait); +} + /* * used for workers that require transaction commits (i.e., for the * NOCOW case) @@ -291,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) atomic_inc(&fs_info->scrubs_running); atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); + + /* + * check if @scrubs_running=@scrubs_paused condition + * inside wait_event() is not an atomic operation. + * which means we may inc/dec @scrub_running/paused + * at any time. Let's wake up @scrub_pause_wait as + * much as we can to let commit transaction blocked less. + */ + wake_up(&fs_info->scrub_pause_wait); + atomic_inc(&sctx->workers_pending); } @@ -394,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->index = i; sbio->sctx = sctx; sbio->page_count = 0; - sbio->work.func = scrub_bio_end_io_worker; + btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, + NULL, NULL); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -481,7 +516,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " "length %llu, links %u (path: %s)\n", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), @@ -493,7 +528,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, return 0; err: - printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " "resolving failed with ret=%d\n", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), @@ -553,10 +588,11 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { - ret = tree_backref_for_extent(&ptr, eb, ei, item_size, - &ref_root, &ref_level); + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); printk_in_rcu(KERN_WARNING - "btrfs: %s at logical %llu on dev %s, " + "BTRFS: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " "%llu\n", errstr, swarn.logical, rcu_str_deref(dev->name), @@ -682,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) out: if (page) put_page(page); - if (inode) - iput(inode); + + iput(inode); if (ret < 0) return ret; @@ -705,13 +741,11 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) struct scrub_fixup_nodatasum *fixup; struct scrub_ctx *sctx; struct btrfs_trans_handle *trans = NULL; - struct btrfs_fs_info *fs_info; struct btrfs_path *path; int uncorrectable = 0; fixup = container_of(work, struct scrub_fixup_nodatasum, work); sctx = fixup->sctx; - fs_info = fixup->root->fs_info; path = btrfs_alloc_path(); if (!path) { @@ -760,8 +794,8 @@ out: btrfs_dev_replace_stats_inc( &sctx->dev_root->fs_info->dev_replace. num_uncorrectable_read_errors); - printk_ratelimited_in_rcu(KERN_ERR - "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "unable to fixup (nodatasum) error at logical %llu on dev %s\n", fixup->logical, rcu_str_deref(fixup->dev->name)); } @@ -938,8 +972,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BTRFS_DEV_STAT_CORRUPTION_ERRS); } - if (sctx->readonly && !sctx->is_dev_replace) - goto did_not_correct_error; + if (sctx->readonly) { + ASSERT(!sctx->is_dev_replace); + goto out; + } if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; @@ -963,9 +999,10 @@ nodatasum_case: fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; scrub_pending_trans_workers_inc(sctx); - fixup_nodatasum->work.func = scrub_fixup_nodatasum; - btrfs_queue_worker(&fs_info->scrub_workers, - &fixup_nodatasum->work); + btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, + NULL, NULL); + btrfs_queue_work(fs_info->scrub_workers, + &fixup_nodatasum->work); goto out; } @@ -1160,7 +1197,7 @@ corrected_error: sctx->stat.corrected_errors++; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR - "btrfs: fixed up error at logical %llu on dev %s\n", + "BTRFS: fixed up error at logical %llu on dev %s\n", logical, rcu_str_deref(dev->name)); } } else { @@ -1169,7 +1206,7 @@ did_not_correct_error: sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR - "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", + "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", logical, rcu_str_deref(dev->name)); } @@ -1292,7 +1329,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; struct scrub_page *page = sblock->pagev[page_num]; - DECLARE_COMPLETION_ONSTACK(complete); if (page->dev->bdev == NULL) { page->io_error = 1; @@ -1308,19 +1344,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, continue; } bio->bi_bdev = page->dev->bdev; - bio->bi_sector = page->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; + bio->bi_iter.bi_sector = page->physical >> 9; bio_add_page(bio, page->page, PAGE_SIZE, 0); - btrfsic_submit_bio(READ, bio); - - /* this will also unplug the queue */ - wait_for_completion(&complete); - - page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (btrfsic_submit_bio_wait(READ, bio)) sblock->no_io_error_seen = 0; + bio_put(bio); } @@ -1389,11 +1418,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, sblock->checksum_error = 1; } -static void scrub_complete_bio_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} - static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int force_write) @@ -1428,11 +1452,11 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, sblock_bad->checksum_error || page_bad->io_error) { struct bio *bio; int ret; - DECLARE_COMPLETION_ONSTACK(complete); if (!page_bad->dev->bdev) { - printk_ratelimited(KERN_WARNING - "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); + printk_ratelimited(KERN_WARNING "BTRFS: " + "scrub_repair_page_from_good_copy(bdev == NULL) " + "is unexpected!\n"); return -EIO; } @@ -1440,20 +1464,15 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, if (!bio) return -EIO; bio->bi_bdev = page_bad->dev->bdev; - bio->bi_sector = page_bad->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; + bio->bi_iter.bi_sector = page_bad->physical >> 9; ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { bio_put(bio); return -EIO; } - btrfsic_submit_bio(WRITE, bio); - /* this will also unplug the queue */ - wait_for_completion(&complete); - if (!bio_flagged(bio, BIO_UPTODATE)) { + if (btrfsic_submit_bio_wait(WRITE, bio)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); btrfs_dev_replace_stats_inc( @@ -1538,7 +1557,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_wr_bio_end_io; bio->bi_bdev = sbio->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || @@ -1597,8 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - sbio->work.func = scrub_wr_bio_end_io_worker; - btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); + btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) @@ -1895,7 +1914,7 @@ static void scrub_submit(struct scrub_ctx *sctx) * This case is handled correctly (but _very_ slowly). */ printk_ratelimited(KERN_WARNING - "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); + "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n"); bio_endio(sbio->bio, -EIO); } else { btrfsic_submit_bio(READ, sbio->bio); @@ -1944,7 +1963,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sbio->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || @@ -2066,7 +2085,7 @@ static void scrub_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); + btrfs_queue_work(fs_info->scrub_workers, &sbio->work); } static void scrub_bio_end_io_worker(struct btrfs_work *work) @@ -2217,6 +2236,47 @@ behind_scrub_pages: return 0; } +/* + * Given a physical address, this will calculate it's + * logical offset. if this is a parity stripe, it will return + * the most left data stripe's logical offset. + * + * return 0 if it is a data stripe, 1 means parity stripe. + */ +static int get_raid56_logic_offset(u64 physical, int num, + struct map_lookup *map, u64 *offset) +{ + int i; + int j = 0; + u64 stripe_nr; + u64 last_offset; + int stripe_index; + int rot; + + last_offset = (physical - map->stripes[num].physical) * + nr_data_stripes(map); + *offset = last_offset; + for (i = 0; i < nr_data_stripes(map); i++) { + *offset = last_offset + i * map->stripe_len; + + stripe_nr = *offset; + do_div(stripe_nr, map->stripe_len); + do_div(stripe_nr, nr_data_stripes(map)); + + /* Work out the disk rotation on this stripe-set */ + rot = do_div(stripe_nr, map->num_stripes); + /* calculate which stripe this data locates */ + rot += i; + stripe_index = rot % map->num_stripes; + if (stripe_index == num) + return 0; + if (stripe_index < num) + j++; + } + *offset = last_offset + j * map->stripe_len; + return 1; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, @@ -2238,6 +2298,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 physical; u64 logical; u64 logic_end; + u64 physical_end; u64 generation; int mirror_num; struct reada_control *reada1; @@ -2251,16 +2312,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 extent_len; struct btrfs_device *extent_dev; int extent_mirror_num; - int stop_loop; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { - if (num >= nr_data_stripes(map)) { - return 0; - } - } + int stop_loop = 0; nstripes = length; + physical = map->stripes[num].physical; offset = 0; do_div(nstripes, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { @@ -2278,6 +2333,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical, num, map, &offset); + increment = map->stripe_len * nr_data_stripes(map); + mirror_num = 1; } else { increment = map->stripe_len; mirror_num = 1; @@ -2301,17 +2361,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * to not hold off transaction commits */ logical = base + offset; - + physical_end = physical + nstripes * map->stripe_len; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical_end, num, + map, &logic_end); + logic_end += base; + } else { + logic_end = logical + increment * nstripes; + } wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); + scrub_blocked_if_needed(fs_info); /* FIXME it might be better to start readahead at commit root */ key_start.objectid = logical; key_start.type = BTRFS_EXTENT_ITEM_KEY; key_start.offset = (u64)0; - key_end.objectid = base + offset + nstripes * increment; + key_end.objectid = logic_end; key_end.type = BTRFS_METADATA_ITEM_KEY; key_end.offset = (u64)-1; reada1 = btrfs_reada_add(root, &key_start, &key_end); @@ -2321,7 +2388,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_start.offset = logical; key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = base + offset + nstripes * increment; + key_end.offset = logic_end; reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); if (!IS_ERR(reada1)) @@ -2329,16 +2396,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (!IS_ERR(reada2)) btrfs_reada_wait(reada2); - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); /* * collect all data csums for the stripe to avoid seeking during @@ -2349,11 +2406,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* * now find all extents for each stripe and scrub them */ - logical = base + offset; - physical = map->stripes[num].physical; - logic_end = logical + increment * nstripes; ret = 0; - while (logical < logic_end) { + while (physical < physical_end) { + /* for raid56, we skip parity stripe */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + ret = get_raid56_logic_offset(physical, num, + map, &logical); + logical += base; + if (ret) + goto skip; + } /* * canceled? */ @@ -2375,22 +2438,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_set(&sctx->wr_ctx.flush_all_writes, 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); + scrub_blocked_if_needed(fs_info); } + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -2398,8 +2453,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, goto out; if (ret > 0) { - ret = btrfs_previous_item(root, path, 0, - BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_previous_extent_item(root, path, 0); if (ret < 0) goto out; if (ret > 0) { @@ -2457,9 +2511,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (key.objectid < logical && (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { - printk(KERN_ERR - "btrfs scrub: tree block %llu spanning " - "stripes, ignored. logical=%llu\n", + btrfs_err(fs_info, + "scrub: tree block %llu spanning " + "stripes, ignored. logical=%llu", key.objectid, logical); goto next; } @@ -2506,15 +2560,29 @@ again: scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { - logical += increment; - physical += map->stripe_len; - + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + /* + * loop until we find next data stripe + * or we have finished all stripes. + */ + do { + physical += map->stripe_len; + ret = get_raid56_logic_offset( + physical, num, + map, &logical); + logical += base; + } while (physical < physical_end && ret); + } else { + physical += map->stripe_len; + logical += increment; + } if (logical < key.objectid + bytes) { cond_resched(); goto again; } - if (logical >= logic_end) { + if (physical >= physical_end) { stop_loop = 1; break; } @@ -2523,6 +2591,7 @@ next: path->slots[0]++; } btrfs_release_path(path); +skip: logical += increment; physical += map->stripe_len; spin_lock(&sctx->stat_lock); @@ -2656,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); - if (found_key.offset + length <= start) { - key.offset = found_key.offset + length; - btrfs_release_path(path); - continue; - } + if (found_key.offset + length <= start) + goto skip; chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -2671,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * the chunk from going away while we scrub it */ cache = btrfs_lookup_block_group(fs_info, chunk_offset); - if (!cache) { - ret = -ENOENT; - break; - } + + /* some chunks are removed but not committed to disk yet, + * continue scrubbing */ + if (!cache) + goto skip; + dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; @@ -2700,25 +2768,24 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); + + /* + * must be called before we decrease @scrub_paused. + * make sure we don't block transaction commit while + * we are waiting pending workers finished. + */ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } + __scrub_blocked_if_needed(fs_info); atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); wake_up(&fs_info->scrub_pause_wait); - dev_replace->cursor_left = dev_replace->cursor_right; - dev_replace->item_needs_writeback = 1; btrfs_put_block_group(cache); if (ret) break; @@ -2732,6 +2799,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; +skip: key.offset = found_key.offset + length; btrfs_release_path(path); } @@ -2782,52 +2852,49 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { int ret = 0; + int flags = WQ_FREEZABLE | WQ_UNBOUND; + int max_active = fs_info->thread_pool_size; - mutex_lock(&fs_info->scrub_lock); if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) - btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, - &fs_info->generic_worker); + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + 1, 4); else - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, - &fs_info->generic_worker); - fs_info->scrub_workers.idle_thresh = 4; - ret = btrfs_start_workers(&fs_info->scrub_workers); - if (ret) + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + max_active, 4); + if (!fs_info->scrub_workers) { + ret = -ENOMEM; goto out; - btrfs_init_workers(&fs_info->scrub_wr_completion_workers, - "scrubwrc", - fs_info->thread_pool_size, - &fs_info->generic_worker); - fs_info->scrub_wr_completion_workers.idle_thresh = 2; - ret = btrfs_start_workers( - &fs_info->scrub_wr_completion_workers); - if (ret) + } + fs_info->scrub_wr_completion_workers = + btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + max_active, 2); + if (!fs_info->scrub_wr_completion_workers) { + ret = -ENOMEM; goto out; - btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, - &fs_info->generic_worker); - ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); - if (ret) + } + fs_info->scrub_nocow_workers = + btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + if (!fs_info->scrub_nocow_workers) { + ret = -ENOMEM; goto out; + } } ++fs_info->scrub_workers_refcnt; out: - mutex_unlock(&fs_info->scrub_lock); - return ret; } static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { - mutex_lock(&fs_info->scrub_lock); if (--fs_info->scrub_workers_refcnt == 0) { - btrfs_stop_workers(&fs_info->scrub_workers); - btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); - btrfs_stop_workers(&fs_info->scrub_nocow_workers); + btrfs_destroy_workqueue(fs_info->scrub_workers); + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); - mutex_unlock(&fs_info->scrub_lock); } int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, @@ -2845,8 +2912,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * check some assumptions */ if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", + btrfs_err(fs_info, + "scrub: size assumption nodesize == leafsize (%d == %d) fails", fs_info->chunk_root->nodesize, fs_info->chunk_root->leafsize); return -EINVAL; @@ -2858,16 +2925,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * the way scrub is implemented. Do not handle this * situation at all because it won't ever happen. */ - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", + btrfs_err(fs_info, + "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); return -EINVAL; } if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ - printk(KERN_ERR - "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n", + btrfs_err(fs_info, + "scrub: size assumption sectorsize != PAGE_SIZE " + "(%d != %lu) fails", fs_info->chunk_root->sectorsize, PAGE_SIZE); return -EINVAL; } @@ -2880,7 +2948,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * would exhaust the array bounds of pagev member in * struct scrub_block */ - pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", + btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize " + "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", fs_info->chunk_root->nodesize, SCRUB_MAX_PAGES_PER_BLOCK, fs_info->chunk_root->sectorsize, @@ -2888,23 +2957,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } - ret = scrub_workers_get(fs_info, is_dev_replace); - if (ret) - return ret; mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || (dev->missing && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); return -ENODEV; } - mutex_lock(&fs_info->scrub_lock); + mutex_lock(&fs_info->scrub_lock); if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); return -EIO; } @@ -2915,10 +2979,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, btrfs_dev_replace_unlock(&fs_info->dev_replace); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); return -EINPROGRESS; } btrfs_dev_replace_unlock(&fs_info->dev_replace); + + ret = scrub_workers_get(fs_info, is_dev_replace); + if (ret) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return ret; + } + sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); @@ -2928,15 +2999,24 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } sctx->readonly = readonly; dev->scrub_device = sctx; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* + * checking @scrub_pause_req here, we can avoid + * race between committing transaction and scrubbing. + */ + __scrub_blocked_if_needed(fs_info); atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); if (!is_dev_replace) { - down_read(&fs_info->scrub_super_lock); + /* + * by holding device list mutex, we can + * kick off writing super in log tree sync. + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); ret = scrub_supers(sctx, dev); - up_read(&fs_info->scrub_super_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); } if (!ret) @@ -2954,10 +3034,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->scrub_lock); dev->scrub_device = NULL; + scrub_workers_put(fs_info); mutex_unlock(&fs_info->scrub_lock); scrub_free_ctx(sctx); - scrub_workers_put(fs_info); return ret; } @@ -2987,16 +3067,6 @@ void btrfs_scrub_continue(struct btrfs_root *root) wake_up(&fs_info->scrub_pause_wait); } -void btrfs_scrub_pause_super(struct btrfs_root *root) -{ - down_write(&root->fs_info->scrub_super_lock); -} - -void btrfs_scrub_continue_super(struct btrfs_root *root) -{ - up_write(&root->fs_info->scrub_super_lock); -} - int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->scrub_lock); @@ -3133,10 +3203,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, nocow_ctx->len = len; nocow_ctx->mirror_num = mirror_num; nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; - nocow_ctx->work.func = copy_nocow_pages_worker; + btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); INIT_LIST_HEAD(&nocow_ctx->inodes); - btrfs_queue_worker(&fs_info->scrub_nocow_workers, - &nocow_ctx->work); + btrfs_queue_work(fs_info->scrub_nocow_workers, + &nocow_ctx->work); return 0; } @@ -3195,7 +3265,8 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) ret = iterate_inodes_from_logical(logical, fs_info, path, record_inode_for_nocow, nocow_ctx); if (ret != 0 && ret != -ENOENT) { - pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n", + btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, " + "phys %llu, len %llu, mir %u, ret %d", logical, physical_for_dev_replace, len, mirror_num, ret); not_written = 1; @@ -3317,7 +3388,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, again: page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { - pr_err("find_or_create_page() failed\n"); + btrfs_err(fs_info, "find_or_create_page() failed"); ret = -ENOMEM; goto out; } @@ -3383,14 +3454,13 @@ static int write_page_nocow(struct scrub_ctx *sctx, struct bio *bio; struct btrfs_device *dev; int ret; - DECLARE_COMPLETION_ONSTACK(compl); dev = sctx->wr_ctx.tgtdev; if (!dev) return -EIO; if (!dev->bdev) { printk_ratelimited(KERN_WARNING - "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); return -EIO; } bio = btrfs_io_bio_alloc(GFP_NOFS, 1); @@ -3400,10 +3470,8 @@ static int write_page_nocow(struct scrub_ctx *sctx, spin_unlock(&sctx->stat_lock); return -ENOMEM; } - bio->bi_private = &compl; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_size = 0; - bio->bi_sector = physical_for_dev_replace >> 9; + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); if (ret != PAGE_CACHE_SIZE) { @@ -3412,10 +3480,8 @@ leave_with_eio: btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } - btrfsic_submit_bio(WRITE_SYNC, bio); - wait_for_completion(&compl); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) goto leave_with_eio; bio_put(bio); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e46e0ed7492..6528aa66218 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -24,12 +24,12 @@ #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> #include <linux/radix-tree.h> -#include <linux/crc32c.h> #include <linux/vmalloc.h> #include <linux/string.h> #include "send.h" #include "backref.h" +#include "hash.h" #include "locking.h" #include "disk-io.h" #include "btrfs_inode.h" @@ -51,15 +51,18 @@ struct fs_path { struct { char *start; char *end; - char *prepared; char *buf; - int buf_len; - unsigned int reversed:1; - unsigned int virtual_mem:1; + unsigned short buf_len:15; + unsigned short reversed:1; char inline_buf[]; }; - char pad[PAGE_SIZE]; + /* + * Average path length does not exceed 200 bytes, we'll have + * better packing in the slab and higher chance to satisfy + * a allocation later during send. + */ + char pad[256]; }; }; #define FS_PATH_INLINE_SIZE \ @@ -88,8 +91,6 @@ struct send_ctx { u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ - struct vfsmount *mnt; - struct btrfs_root *send_root; struct btrfs_root *parent_root; struct clone_root *clone_roots; @@ -111,6 +112,8 @@ struct send_ctx { int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; + u64 cur_inode_rdev; + u64 cur_inode_last_extent; u64 send_progress; @@ -121,8 +124,130 @@ struct send_ctx { struct list_head name_cache_list; int name_cache_size; - struct file *cur_inode_filp; + struct file_ra_state ra; + char *read_buf; + + /* + * We process inodes by their increasing order, so if before an + * incremental send we reverse the parent/child relationship of + * directories such that a directory with a lower inode number was + * the parent of a directory with a higher inode number, and the one + * becoming the new parent got renamed too, we can't rename/move the + * directory with lower inode number when we finish processing it - we + * must process the directory with higher inode number first, then + * rename/move it and then rename/move the directory with lower inode + * number. Example follows. + * + * Tree state when the first send was performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * | + * | + * |-- c (ino 259) + * | |-- d (ino 260) + * | + * |-- c2 (ino 261) + * + * Tree state when the second (incremental) send is performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * |-- c2 (ino 261) + * |-- d2 (ino 260) + * |-- cc (ino 259) + * + * The sequence of steps that lead to the second state was: + * + * mv /a/b/c/d /a/b/c2/d2 + * mv /a/b/c /a/b/c2/d2/cc + * + * "c" has lower inode number, but we can't move it (2nd mv operation) + * before we move "d", which has higher inode number. + * + * So we just memorize which move/rename operations must be performed + * later when their respective parent is processed and moved/renamed. + */ + + /* Indexed by parent directory inode number. */ + struct rb_root pending_dir_moves; + + /* + * Reverse index, indexed by the inode number of a directory that + * is waiting for the move/rename of its immediate parent before its + * own move/rename can be performed. + */ + struct rb_root waiting_dir_moves; + + /* + * A directory that is going to be rm'ed might have a child directory + * which is in the pending directory moves index above. In this case, + * the directory can only be removed after the move/rename of its child + * is performed. Example: + * + * Parent snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- c/ (ino 259) + * | |-- x/ (ino 260) + * | + * |-- y/ (ino 261) + * + * Send snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- YY/ (ino 261) + * |-- x/ (ino 260) + * + * Sequence of steps that lead to the send snapshot: + * rm -f /a/b/c/foo.txt + * mv /a/b/y /a/b/YY + * mv /a/b/c/x /a/b/YY + * rmdir /a/b/c + * + * When the child is processed, its move/rename is delayed until its + * parent is processed (as explained above), but all other operations + * like update utimes, chown, chgrp, etc, are performed and the paths + * that it uses for those operations must use the orphanized name of + * its parent (the directory we're going to rm later), so we need to + * memorize that name. + * + * Indexed by the inode number of the directory to be deleted. + */ + struct rb_root orphan_dirs; +}; + +struct pending_dir_move { + struct rb_node node; + struct list_head list; + u64 parent_ino; + u64 ino; + u64 gen; + struct list_head update_refs; +}; + +struct waiting_dir_move { + struct rb_node node; + u64 ino; + /* + * There might be some directory that could not be removed because it + * was waiting for this directory inode to be moved first. Therefore + * after this directory is moved, we can try to rmdir the ino rmdir_ino. + */ + u64 rmdir_ino; +}; + +struct orphan_dir_info { + struct rb_node node; + u64 ino; + u64 gen; }; struct name_cache_entry { @@ -146,6 +271,20 @@ struct name_cache_entry { char name[]; }; +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino); + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino); + +static int need_send_hole(struct send_ctx *sctx) +{ + return (sctx->parent_root && !sctx->cur_inode_new && + !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && + S_ISREG(sctx->cur_inode_mode)); +} + static void fs_path_reset(struct fs_path *p) { if (p->reversed) { @@ -167,7 +306,6 @@ static struct fs_path *fs_path_alloc(void) if (!p) return NULL; p->reversed = 0; - p->virtual_mem = 0; p->buf = p->inline_buf; p->buf_len = FS_PATH_INLINE_SIZE; fs_path_reset(p); @@ -190,12 +328,8 @@ static void fs_path_free(struct fs_path *p) { if (!p) return; - if (p->buf != p->inline_buf) { - if (p->virtual_mem) - vfree(p->buf); - else - kfree(p->buf); - } + if (p->buf != p->inline_buf) + kfree(p->buf); kfree(p); } @@ -215,42 +349,33 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; + if (len > PATH_MAX) { + WARN_ON(1); + return -ENOMEM; + } + path_len = p->end - p->start; old_buf_len = p->buf_len; - len = PAGE_ALIGN(len); + /* + * First time the inline_buf does not suffice + */ if (p->buf == p->inline_buf) { - tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - p->virtual_mem = 1; - } - memcpy(tmp_buf, p->buf, p->buf_len); - p->buf = tmp_buf; - p->buf_len = len; + tmp_buf = kmalloc(len, GFP_NOFS); + if (tmp_buf) + memcpy(tmp_buf, p->buf, old_buf_len); } else { - if (p->virtual_mem) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p->buf, p->buf_len); - vfree(p->buf); - } else { - tmp_buf = krealloc(p->buf, len, GFP_NOFS); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p->buf, p->buf_len); - kfree(p->buf); - p->virtual_mem = 1; - } - } - p->buf = tmp_buf; - p->buf_len = len; + tmp_buf = krealloc(p->buf, len, GFP_NOFS); } + if (!tmp_buf) + return -ENOMEM; + p->buf = tmp_buf; + /* + * The real size of the buffer is bigger, this will let the fast path + * happen most of the time + */ + p->buf_len = ksize(p->buf); + if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; p->end = p->buf + p->buf_len - 1; @@ -263,7 +388,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) return 0; } -static int fs_path_prepare_for_add(struct fs_path *p, int name_len) +static int fs_path_prepare_for_add(struct fs_path *p, int name_len, + char **prepared) { int ret; int new_len; @@ -279,11 +405,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len) if (p->start != p->end) *--p->start = '/'; p->start -= name_len; - p->prepared = p->start; + *prepared = p->start; } else { if (p->start != p->end) *p->end++ = '/'; - p->prepared = p->end; + *prepared = p->end; p->end += name_len; *p->end = 0; } @@ -295,12 +421,12 @@ out: static int fs_path_add(struct fs_path *p, const char *name, int name_len) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, name_len); + ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) goto out; - memcpy(p->prepared, name, name_len); - p->prepared = NULL; + memcpy(prepared, name, name_len); out: return ret; @@ -309,12 +435,12 @@ out: static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, p2->end - p2->start); + ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); if (ret < 0) goto out; - memcpy(p->prepared, p2->start, p2->end - p2->start); - p->prepared = NULL; + memcpy(prepared, p2->start, p2->end - p2->start); out: return ret; @@ -325,28 +451,18 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p, unsigned long off, int len) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, len); + ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) goto out; - read_extent_buffer(eb, p->prepared, off, len); - p->prepared = NULL; + read_extent_buffer(eb, prepared, off, len); out: return ret; } -#if 0 -static void fs_path_remove(struct fs_path *p) -{ - BUG_ON(p->reversed); - while (p->start != p->end && *p->end != '/') - p->end--; - *p->end = 0; -} -#endif - static int fs_path_copy(struct fs_path *p, struct fs_path *from) { int ret; @@ -385,6 +501,7 @@ static struct btrfs_path *alloc_path_for_send(void) return NULL; path->search_commit_root = 1; path->skip_locking = 1; + path->need_commit_sem = 1; return path; } @@ -437,30 +554,15 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return 0; } -#if 0 -static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value) -{ - return tlv_put(sctx, attr, &value, sizeof(value)); -} - -static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value) -{ - __le16 tmp = cpu_to_le16(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} - -static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value) -{ - __le32 tmp = cpu_to_le32(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} -#endif +#define TLV_PUT_DEFINE_INT(bits) \ + static int tlv_put_u##bits(struct send_ctx *sctx, \ + u##bits attr, u##bits value) \ + { \ + __le##bits __tmp = cpu_to_le##bits(value); \ + return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ + } -static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value) -{ - __le64 tmp = cpu_to_le64(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} +TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, const char *str, int len) @@ -476,17 +578,6 @@ static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); } -#if 0 -static int tlv_put_timespec(struct send_ctx *sctx, u16 attr, - struct timespec *ts) -{ - struct btrfs_timespec bts; - bts.sec = cpu_to_le64(ts->tv_sec); - bts.nsec = cpu_to_le32(ts->tv_nsec); - return tlv_put(sctx, attr, &bts, sizeof(bts)); -} -#endif - static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, struct extent_buffer *eb, struct btrfs_timespec *ts) @@ -534,12 +625,6 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, if (ret < 0) \ goto tlv_put_failure; \ } while (0) -#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \ - do { \ - ret = tlv_put_timespec(sctx, attrtype, ts); \ - if (ret < 0) \ - goto tlv_put_failure; \ - } while (0) #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ do { \ ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ @@ -565,10 +650,8 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) { struct btrfs_cmd_header *hdr; - if (!sctx->send_buf) { - WARN_ON(1); + if (WARN_ON(!sctx->send_buf)) return -EINVAL; - } BUG_ON(sctx->send_size); @@ -589,7 +672,7 @@ static int send_cmd(struct send_ctx *sctx) hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); hdr->crc = 0; - crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -697,29 +780,22 @@ out: /* * Helper function to retrieve some fields from an inode item. */ -static int get_inode_info(struct btrfs_root *root, - u64 ino, u64 *size, u64 *gen, - u64 *mode, u64 *uid, u64 *gid, - u64 *rdev) +static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, + u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, + u64 *gid, u64 *rdev) { int ret; struct btrfs_inode_item *ii; struct btrfs_key key; - struct btrfs_path *path; - - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; if (ret) { - ret = -ENOENT; - goto out; + if (ret > 0) + ret = -ENOENT; + return ret; } ii = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -737,7 +813,22 @@ static int get_inode_info(struct btrfs_root *root, if (rdev) *rdev = btrfs_inode_rdev(path->nodes[0], ii); -out: + return ret; +} + +static int get_inode_info(struct btrfs_root *root, + u64 ino, u64 *size, u64 *gen, + u64 *mode, u64 *uid, u64 *gid, + u64 *rdev) +{ + struct btrfs_path *path; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, + rdev); btrfs_free_path(path); return ret; } @@ -791,7 +882,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, if (found_key->type == BTRFS_INODE_REF_KEY) { ptr = (unsigned long)btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - item = btrfs_item_nr(eb, slot); + item = btrfs_item_nr(slot); total = btrfs_item_size(eb, item); elem_size = sizeof(*iref); } else { @@ -884,9 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; - char *buf2 = NULL; int buf_len; - int buf_virtual = 0; u32 name_len; u32 data_len; u32 cur; @@ -896,7 +985,11 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; - buf_len = PAGE_SIZE; + if (found_key->type == BTRFS_XATTR_ITEM_KEY) + buf_len = BTRFS_MAX_XATTR_SIZE(root); + else + buf_len = PATH_MAX; + buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -905,7 +998,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, eb = path->nodes[0]; slot = path->slots[0]; - item = btrfs_item_nr(eb, slot); + item = btrfs_item_nr(slot); di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); cur = 0; len = 0; @@ -918,30 +1011,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - if (name_len + data_len > buf_len) { - buf_len = PAGE_ALIGN(name_len + data_len); - if (buf_virtual) { - buf2 = vmalloc(buf_len); - if (!buf2) { - ret = -ENOMEM; - goto out; - } - vfree(buf); - } else { - buf2 = krealloc(buf, buf_len, GFP_NOFS); - if (!buf2) { - buf2 = vmalloc(buf_len); - if (!buf2) { - ret = -ENOMEM; - goto out; - } - kfree(buf); - buf_virtual = 1; - } + if (type == BTRFS_FT_XATTR) { + if (name_len > XATTR_NAME_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + if (name_len + data_len > buf_len) { + ret = -E2BIG; + goto out; + } + } else { + /* + * Path too long + */ + if (name_len + data_len > buf_len) { + ret = -ENAMETOOLONG; + goto out; } - - buf = buf2; - buf2 = NULL; } read_extent_buffer(eb, buf, (unsigned long)(di + 1), @@ -964,10 +1050,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } out: - if (buf_virtual) - vfree(buf); - else - kfree(buf); + kfree(buf); return ret; } @@ -1035,6 +1118,7 @@ out: struct backref_ctx { struct send_ctx *sctx; + struct btrfs_path *path; /* number of total found references */ u64 found; @@ -1105,8 +1189,9 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) * There are inodes that have extents that lie behind its i_size. Don't * accept clones from these extents. */ - ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, - NULL); + ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL, + NULL, NULL, NULL); + btrfs_release_path(bctx->path); if (ret < 0) return ret; @@ -1185,12 +1270,17 @@ static int find_extent_clone(struct send_ctx *sctx, if (!tmp_path) return -ENOMEM; + /* We only use this path under the commit sem */ + tmp_path->need_commit_sem = 0; + backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); if (!backref_ctx) { ret = -ENOMEM; goto out; } + backref_ctx->path = tmp_path; + if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. @@ -1218,8 +1308,10 @@ static int find_extent_clone(struct send_ctx *sctx, } logical = disk_byte + btrfs_file_extent_offset(eb, fi); + down_read(&sctx->send_root->fs_info->commit_root_sem); ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, &found_key, &flags); + up_read(&sctx->send_root->fs_info->commit_root_sem); btrfs_release_path(tmp_path); if (ret < 0) @@ -1261,8 +1353,6 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = logical - found_key.objectid; else extent_item_pos = 0; - - extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(sctx->send_root->fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, backref_ctx); @@ -1273,9 +1363,9 @@ static int find_extent_clone(struct send_ctx *sctx, if (!backref_ctx->found_itself) { /* found a bug in backref code? */ ret = -EIO; - printk(KERN_ERR "btrfs: ERROR did not find backref in " + btrfs_err(sctx->send_root->fs_info, "did not find backref in " "send_root. inode=%llu, offset=%llu, " - "disk_byte=%llu found extent=%llu\n", + "disk_byte=%llu found extent=%llu", ino, data_offset, disk_byte, found_key.objectid); goto out; } @@ -1301,6 +1391,16 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " } if (cur_clone_root) { + if (compressed != BTRFS_COMPRESS_NONE) { + /* + * Offsets given by iterate_extent_inodes() are relative + * to the start of the extent, we need to add logical + * offset from the file extent item. + * (See why at backref.c:check_extent_in_eb()) + */ + cur_clone_root->offset += btrfs_file_extent_offset(eb, + fi); + } *found = cur_clone_root; ret = 0; } else { @@ -1346,7 +1446,7 @@ static int read_symlink(struct btrfs_root *root, BUG_ON(compression); off = btrfs_file_extent_inline_start(ei); - len = btrfs_file_extent_inline_len(path->nodes[0], ei); + len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei); ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); @@ -1375,13 +1475,9 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { - len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu", + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); - if (len >= sizeof(tmp)) { - /* should really not happen */ - ret = -EOVERFLOW; - goto out; - } + ASSERT(len < sizeof(tmp)); di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, @@ -1548,6 +1644,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root, goto out; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.type == BTRFS_ROOT_ITEM_KEY) { + ret = -ENOENT; + goto out; + } *found_inode = key.objectid; *found_type = btrfs_dir_type(path->nodes[0], di); @@ -1591,7 +1691,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; } - if (key.type == BTRFS_INODE_REF_KEY) { + if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; iref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); @@ -1613,10 +1713,12 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; btrfs_release_path(path); - ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL, - NULL, NULL); - if (ret < 0) - goto out; + if (dir_gen) { + ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, + NULL, NULL, NULL); + if (ret < 0) + goto out; + } *dir = parent_dir; @@ -1632,13 +1734,12 @@ static int is_first_ref(struct btrfs_root *root, int ret; struct fs_path *tmp_name; u64 tmp_dir; - u64 tmp_dir_gen; tmp_name = fs_path_alloc(); if (!tmp_name) return -ENOMEM; - ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); + ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); if (ret < 0) goto out; @@ -1857,13 +1958,20 @@ static void name_cache_delete(struct send_ctx *sctx, nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)nce->ino); - BUG_ON(!nce_head); + if (!nce_head) { + btrfs_err(sctx->send_root->fs_info, + "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", + nce->ino, sctx->name_cache_size); + } list_del(&nce->radix_list); list_del(&nce->list); sctx->name_cache_size--; - if (list_empty(nce_head)) { + /* + * We may not get to the final release of nce_head if the lookup fails + */ + if (nce_head && list_empty(nce_head)) { radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } @@ -1942,7 +2050,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, { int ret; int nce_ret; - struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; /* @@ -1968,10 +2075,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, } } - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - /* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in @@ -2047,7 +2150,6 @@ out_cache: name_cache_clean_unused(sctx); out: - btrfs_free_path(path); return ret; } @@ -2097,12 +2199,27 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { fs_path_reset(name); - ret = __get_cur_name_and_parent(sctx, ino, gen, - &parent_inode, &parent_gen, name); + if (is_waiting_for_rm(sctx, ino)) { + ret = gen_unique_name(sctx, ino, gen, name); + if (ret < 0) + goto out; + ret = fs_path_add_path(dest, name); + break; + } + + if (is_waiting_for_move(sctx, ino)) { + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret) + stop = 1; + } + if (ret < 0) goto out; - if (ret) - stop = 1; ret = fs_path_add_path(dest, name); if (ret < 0) @@ -2120,77 +2237,6 @@ out: } /* - * Called for regular files when sending extents data. Opens a struct file - * to read from the file. - */ -static int open_cur_inode_file(struct send_ctx *sctx) -{ - int ret = 0; - struct btrfs_key key; - struct path path; - struct inode *inode; - struct dentry *dentry; - struct file *filp; - int new = 0; - - if (sctx->cur_inode_filp) - goto out; - - key.objectid = sctx->cur_ino; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root, - &new); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto out; - } - - dentry = d_obtain_alias(inode); - inode = NULL; - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - goto out; - } - - path.mnt = sctx->mnt; - path.dentry = dentry; - filp = dentry_open(&path, O_RDONLY | O_LARGEFILE, current_cred()); - dput(dentry); - dentry = NULL; - if (IS_ERR(filp)) { - ret = PTR_ERR(filp); - goto out; - } - sctx->cur_inode_filp = filp; - -out: - /* - * no xxxput required here as every vfs op - * does it by itself on failure - */ - return ret; -} - -/* - * Closes the struct file that was created in open_cur_inode_file - */ -static int close_cur_inode_file(struct send_ctx *sctx) -{ - int ret = 0; - - if (!sctx->cur_inode_filp) - goto out; - - ret = filp_close(sctx->cur_inode_filp, NULL); - sctx->cur_inode_filp = NULL; - -out: - return ret; -} - -/* * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace */ static int send_subvol_begin(struct send_ctx *sctx) @@ -2205,7 +2251,7 @@ static int send_subvol_begin(struct send_ctx *sctx) char *name = NULL; int namelen; - path = alloc_path_for_send(); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2254,12 +2300,12 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, sctx->send_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, - sctx->send_root->root_item.ctransid); + le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, sctx->parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - sctx->parent_root->root_item.ctransid); + le64_to_cpu(sctx->parent_root->root_item.ctransid)); } ret = send_cmd(sctx); @@ -2437,10 +2483,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); if (!p) return -ENOMEM; - ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, - NULL, &rdev); - if (ret < 0) - goto out; + if (ino != sctx->cur_ino) { + ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, + NULL, NULL, &rdev); + if (ret < 0) + goto out; + } else { + gen = sctx->cur_inode_gen; + mode = sctx->cur_inode_mode; + rdev = sctx->cur_inode_rdev; + } if (S_ISREG(mode)) { cmd = BTRFS_SEND_C_MKFILE; @@ -2520,17 +2572,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { - ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, - 1, 0); - if (ret < 0) - goto out; - if (!ret) { - eb = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(sctx->send_root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; } - if (ret || found_key.objectid != key.objectid || + + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; goto out; @@ -2545,8 +2606,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) goto out; } - key.offset = found_key.offset + 1; - btrfs_release_path(path); + path->slots[0]++; } out: @@ -2598,7 +2658,7 @@ struct recorded_ref { * everything mixed. So we first record all refs and later process them. * This function is a helper to record one ref. */ -static int record_ref(struct list_head *head, u64 dir, +static int __record_ref(struct list_head *head, u64 dir, u64 dir_gen, struct fs_path *path) { struct recorded_ref *ref; @@ -2684,12 +2744,78 @@ out: return ret; } +static struct orphan_dir_info * +add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node **p = &sctx->orphan_dirs.rb_node; + struct rb_node *parent = NULL; + struct orphan_dir_info *entry, *odi; + + odi = kmalloc(sizeof(*odi), GFP_NOFS); + if (!odi) + return ERR_PTR(-ENOMEM); + odi->ino = dir_ino; + odi->gen = 0; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct orphan_dir_info, node); + if (dir_ino < entry->ino) { + p = &(*p)->rb_left; + } else if (dir_ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(odi); + return entry; + } + } + + rb_link_node(&odi->node, parent, p); + rb_insert_color(&odi->node, &sctx->orphan_dirs); + return odi; +} + +static struct orphan_dir_info * +get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node *n = sctx->orphan_dirs.rb_node; + struct orphan_dir_info *entry; + + while (n) { + entry = rb_entry(n, struct orphan_dir_info, node); + if (dir_ino < entry->ino) + n = n->rb_left; + else if (dir_ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino) +{ + struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino); + + return odi != NULL; +} + +static void free_orphan_dir_info(struct send_ctx *sctx, + struct orphan_dir_info *odi) +{ + if (!odi) + return; + rb_erase(&odi->node, &sctx->orphan_dirs); + kfree(odi); +} + /* * Returns 1 if a directory can be removed at this point in time. * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. */ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + u64 send_progress) { int ret = 0; struct btrfs_root *root = sctx->parent_root; @@ -2712,31 +2838,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (!ret) { - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); + struct waiting_dir_move *dm; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; } - if (ret || found_key.objectid != key.objectid || - found_key.type != key.type) { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid || + found_key.type != key.type) break; - } di = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dm = get_waiting_dir_move(sctx, loc.objectid); + if (dm) { + struct orphan_dir_info *odi; + + odi = add_orphan_dir_info(sctx, dir); + if (IS_ERR(odi)) { + ret = PTR_ERR(odi); + goto out; + } + odi->gen = dir_gen; + dm->rmdir_ino = dir; + ret = 0; + goto out; + } + if (loc.objectid > send_progress) { ret = 0; goto out; } - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } ret = 1; @@ -2746,10 +2893,452 @@ out: return ret; } +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) +{ + struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); + + return entry != NULL; +} + +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node **p = &sctx->waiting_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct waiting_dir_move *entry, *dm; + + dm = kmalloc(sizeof(*dm), GFP_NOFS); + if (!dm) + return -ENOMEM; + dm->ino = ino; + dm->rmdir_ino = 0; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct waiting_dir_move, node); + if (ino < entry->ino) { + p = &(*p)->rb_left; + } else if (ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(dm); + return -EEXIST; + } + } + + rb_link_node(&dm->node, parent, p); + rb_insert_color(&dm->node, &sctx->waiting_dir_moves); + return 0; +} + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node *n = sctx->waiting_dir_moves.rb_node; + struct waiting_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct waiting_dir_move, node); + if (ino < entry->ino) + n = n->rb_left; + else if (ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static void free_waiting_dir_move(struct send_ctx *sctx, + struct waiting_dir_move *dm) +{ + if (!dm) + return; + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); +} + +static int add_pending_dir_move(struct send_ctx *sctx, + u64 ino, + u64 ino_gen, + u64 parent_ino, + struct list_head *new_refs, + struct list_head *deleted_refs) +{ + struct rb_node **p = &sctx->pending_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct pending_dir_move *entry = NULL, *pm; + struct recorded_ref *cur; + int exists = 0; + int ret; + + pm = kmalloc(sizeof(*pm), GFP_NOFS); + if (!pm) + return -ENOMEM; + pm->parent_ino = parent_ino; + pm->ino = ino; + pm->gen = ino_gen; + INIT_LIST_HEAD(&pm->list); + INIT_LIST_HEAD(&pm->update_refs); + RB_CLEAR_NODE(&pm->node); + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) { + p = &(*p)->rb_left; + } else if (parent_ino > entry->parent_ino) { + p = &(*p)->rb_right; + } else { + exists = 1; + break; + } + } + + list_for_each_entry(cur, deleted_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + list_for_each_entry(cur, new_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + + ret = add_waiting_dir_move(sctx, pm->ino); + if (ret) + goto out; + + if (exists) { + list_add_tail(&pm->list, &entry->list); + } else { + rb_link_node(&pm->node, parent, p); + rb_insert_color(&pm->node, &sctx->pending_dir_moves); + } + ret = 0; +out: + if (ret) { + __free_recorded_refs(&pm->update_refs); + kfree(pm); + } + return ret; +} + +static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, + u64 parent_ino) +{ + struct rb_node *n = sctx->pending_dir_moves.rb_node; + struct pending_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) + n = n->rb_left; + else if (parent_ino > entry->parent_ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int path_loop(struct send_ctx *sctx, struct fs_path *name, + u64 ino, u64 gen, u64 *ancestor_ino) +{ + int ret = 0; + u64 parent_inode = 0; + u64 parent_gen = 0; + u64 start_ino = ino; + + *ancestor_ino = 0; + while (ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino)) + break; + if (is_waiting_for_move(sctx, ino)) { + if (*ancestor_ino == 0) + *ancestor_ino = ino; + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret > 0) { + ret = 0; + break; + } + } + if (ret < 0) + break; + if (parent_inode == start_ino) { + ret = 1; + if (*ancestor_ino == 0) + *ancestor_ino = ino; + break; + } + ino = parent_inode; + gen = parent_gen; + } + return ret; +} + +static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) +{ + struct fs_path *from_path = NULL; + struct fs_path *to_path = NULL; + struct fs_path *name = NULL; + u64 orig_progress = sctx->send_progress; + struct recorded_ref *cur; + u64 parent_ino, parent_gen; + struct waiting_dir_move *dm = NULL; + u64 rmdir_ino = 0; + int ret; + u64 ancestor = 0; + + name = fs_path_alloc(); + from_path = fs_path_alloc(); + if (!name || !from_path) { + ret = -ENOMEM; + goto out; + } + + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + rmdir_ino = dm->rmdir_ino; + free_waiting_dir_move(sctx, dm); + + ret = get_first_ref(sctx->parent_root, pm->ino, + &parent_ino, &parent_gen, name); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + if (ret < 0) + goto out; + + sctx->send_progress = sctx->cur_ino + 1; + ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); + if (ret) { + LIST_HEAD(deleted_refs); + ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); + ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, + &pm->update_refs, &deleted_refs); + if (ret < 0) + goto out; + if (rmdir_ino) { + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + dm->rmdir_ino = rmdir_ino; + } + goto out; + } + fs_path_reset(name); + to_path = name; + name = NULL; + ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); + if (ret < 0) + goto out; + + ret = send_rename(sctx, from_path, to_path); + if (ret < 0) + goto out; + + if (rmdir_ino) { + struct orphan_dir_info *odi; + + odi = get_orphan_dir_info(sctx, rmdir_ino); + if (!odi) { + /* already deleted */ + goto finish; + } + ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); + if (ret < 0) + goto out; + if (!ret) + goto finish; + + name = fs_path_alloc(); + if (!name) { + ret = -ENOMEM; + goto out; + } + ret = get_cur_path(sctx, rmdir_ino, odi->gen, name); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, name); + if (ret < 0) + goto out; + free_orphan_dir_info(sctx, odi); + } + +finish: + ret = send_utimes(sctx, pm->ino, pm->gen); + if (ret < 0) + goto out; + + /* + * After rename/move, need to update the utimes of both new parent(s) + * and old parent(s). + */ + list_for_each_entry(cur, &pm->update_refs, list) { + if (cur->dir == rmdir_ino) + continue; + ret = send_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } + +out: + fs_path_free(name); + fs_path_free(from_path); + fs_path_free(to_path); + sctx->send_progress = orig_progress; + + return ret; +} + +static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) +{ + if (!list_empty(&m->list)) + list_del(&m->list); + if (!RB_EMPTY_NODE(&m->node)) + rb_erase(&m->node, &sctx->pending_dir_moves); + __free_recorded_refs(&m->update_refs); + kfree(m); +} + +static void tail_append_pending_moves(struct pending_dir_move *moves, + struct list_head *stack) +{ + if (list_empty(&moves->list)) { + list_add_tail(&moves->list, stack); + } else { + LIST_HEAD(list); + list_splice_init(&moves->list, &list); + list_add_tail(&moves->list, stack); + list_splice_tail(&list, stack); + } +} + +static int apply_children_dir_moves(struct send_ctx *sctx) +{ + struct pending_dir_move *pm; + struct list_head stack; + u64 parent_ino = sctx->cur_ino; + int ret = 0; + + pm = get_pending_dir_moves(sctx, parent_ino); + if (!pm) + return 0; + + INIT_LIST_HEAD(&stack); + tail_append_pending_moves(pm, &stack); + + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + parent_ino = pm->ino; + ret = apply_dir_move(sctx, pm); + free_pending_move(sctx, pm); + if (ret) + goto out; + pm = get_pending_dir_moves(sctx, parent_ino); + if (pm) + tail_append_pending_moves(pm, &stack); + } + return 0; + +out: + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + free_pending_move(sctx, pm); + } + return ret; +} + +static int wait_for_parent_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref) +{ + int ret = 0; + u64 ino = parent_ref->dir; + u64 parent_ino_before, parent_ino_after; + struct fs_path *path_before = NULL; + struct fs_path *path_after = NULL; + int len1, len2; + + path_after = fs_path_alloc(); + path_before = fs_path_alloc(); + if (!path_after || !path_before) { + ret = -ENOMEM; + goto out; + } + + /* + * Our current directory inode may not yet be renamed/moved because some + * ancestor (immediate or not) has to be renamed/moved first. So find if + * such ancestor exists and make sure our own rename/move happens after + * that ancestor is processed. + */ + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + if (is_waiting_for_move(sctx, ino)) { + ret = 1; + break; + } + + fs_path_reset(path_before); + fs_path_reset(path_after); + + ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, + NULL, path_after); + if (ret < 0) + goto out; + ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, + NULL, path_before); + if (ret < 0 && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 1; + break; + } + + len1 = fs_path_len(path_before); + len2 = fs_path_len(path_after); + if (ino > sctx->cur_ino && + (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1))) { + ret = 1; + break; + } + ino = parent_ino_after; + } + +out: + fs_path_free(path_before); + fs_path_free(path_after); + + if (ret == 1) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + ino, + &sctx->new_refs, + &sctx->deleted_refs); + if (!ret) + ret = 1; + } + + return ret; +} + /* * This does all the move/link/unlink/rmdir magic. */ -static int process_recorded_refs(struct send_ctx *sctx) +static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) { int ret = 0; struct recorded_ref *cur; @@ -2760,6 +3349,7 @@ static int process_recorded_refs(struct send_ctx *sctx) u64 ow_gen; int did_overwrite = 0; int is_orphan = 0; + u64 last_dir_ino_rm = 0; verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); @@ -2898,11 +3488,18 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = send_rename(sctx, valid_path, - cur->full_path); + ret = wait_for_parent_move(sctx, cur); if (ret < 0) goto out; - ret = fs_path_copy(valid_path, cur->full_path); + if (ret) { + *pending_move = 1; + } else { + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); + } if (ret < 0) goto out; } else { @@ -2924,7 +3521,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. */ - ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, + sctx->cur_ino); if (ret < 0) goto out; if (ret) { @@ -3015,8 +3613,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - } else if (ret == inode_state_did_delete) { - ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); + } else if (ret == inode_state_did_delete && + cur->dir != last_dir_ino_rm) { + ret = can_rmdir(sctx, cur->dir, cur->dir_gen, + sctx->cur_ino); if (ret < 0) goto out; if (ret) { @@ -3027,6 +3627,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = send_rmdir(sctx, valid_path); if (ret < 0) goto out; + last_dir_ino_rm = cur->dir; } } } @@ -3040,9 +3641,8 @@ out: return ret; } -static int __record_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, + struct fs_path *name, void *ctx, struct list_head *refs) { int ret = 0; struct send_ctx *sctx = ctx; @@ -3053,7 +3653,7 @@ static int __record_new_ref(int num, u64 dir, int index, if (!p) return -ENOMEM; - ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, + ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -3065,7 +3665,7 @@ static int __record_new_ref(int num, u64 dir, int index, if (ret < 0) goto out; - ret = record_ref(&sctx->new_refs, dir, gen, p); + ret = __record_ref(refs, dir, gen, p); out: if (ret) @@ -3073,37 +3673,23 @@ out: return ret; } +static int __record_new_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + struct send_ctx *sctx = ctx; + return record_ref(sctx->send_root, num, dir, index, name, + ctx, &sctx->new_refs); +} + + static int __record_deleted_ref(int num, u64 dir, int index, struct fs_path *name, void *ctx) { - int ret = 0; struct send_ctx *sctx = ctx; - struct fs_path *p; - u64 gen; - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, - NULL, NULL); - if (ret < 0) - goto out; - - ret = get_cur_path(sctx, dir, gen, p); - if (ret < 0) - goto out; - ret = fs_path_add_path(p, name); - if (ret < 0) - goto out; - - ret = record_ref(&sctx->deleted_refs, dir, gen, p); - -out: - if (ret) - fs_path_free(p); - return ret; + return record_ref(sctx->parent_root, num, dir, index, name, + ctx, &sctx->deleted_refs); } static int record_new_ref(struct send_ctx *sctx) @@ -3271,6 +3857,7 @@ static int process_all_refs(struct send_ctx *sctx, struct extent_buffer *eb; int slot; iterate_inode_ref_t cb; + int pending_move = 0; path = alloc_path_for_send(); if (!path) @@ -3283,21 +3870,31 @@ static int process_all_refs(struct send_ctx *sctx, root = sctx->parent_root; cb = __record_deleted_ref; } else { - BUG(); + btrfs_err(sctx->send_root->fs_info, + "Wrong command %d in process_all_refs", cmd); + ret = -EINVAL; + goto out; } key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) - break; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || @@ -3306,15 +3903,16 @@ static int process_all_refs(struct send_ctx *sctx, break; ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); - btrfs_release_path(path); if (ret < 0) goto out; - key.offset = found_key.offset + 1; + path->slots[0]++; } btrfs_release_path(path); - ret = process_recorded_refs(sctx); + ret = process_recorded_refs(sctx, &pending_move); + /* Only applicable to an incremental send. */ + ASSERT(pending_move == 0); out: btrfs_free_path(path); @@ -3589,19 +4187,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; @@ -3613,8 +4217,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) if (ret < 0) goto out; - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } out: @@ -3622,6 +4225,79 @@ out: return ret; } +static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct page *page; + char *addr; + struct btrfs_key key; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + pgoff_t last_index; + unsigned pg_offset = offset & ~PAGE_CACHE_MASK; + ssize_t ret = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (offset + len > i_size_read(inode)) { + if (offset > i_size_read(inode)) + len = 0; + else + len = offset - i_size_read(inode); + } + if (len == 0) + goto out; + + last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; + + /* initial readahead */ + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, inode->i_mapping); + btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index, + last_index - index + 1); + + while (index <= last_index) { + unsigned cur_len = min_t(unsigned, len, + PAGE_CACHE_SIZE - pg_offset); + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + break; + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + break; + } + } + + addr = kmap(page); + memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + kunmap(page); + unlock_page(page); + page_cache_release(page); + index++; + pg_offset = 0; + len -= cur_len; + ret += cur_len; + } +out: + iput(inode); + return ret; +} + /* * Read some bytes from the current inode/file and send a write command to * user space. @@ -3630,35 +4306,20 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) { int ret = 0; struct fs_path *p; - loff_t pos = offset; - int num_read = 0; - mm_segment_t old_fs; + ssize_t num_read = 0; p = fs_path_alloc(); if (!p) return -ENOMEM; - /* - * vfs normally only accepts user space buffers for security reasons. - * we only read from the file and also only provide the read_buf buffer - * to vfs. As this buffer does not come from a user space call, it's - * ok to temporary allow kernel space buffers. - */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); - ret = open_cur_inode_file(sctx); - if (ret < 0) - goto out; - - ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos); - if (ret < 0) - goto out; - num_read = ret; - if (!num_read) + num_read = fill_read_buf(sctx, offset, len); + if (num_read <= 0) { + if (num_read < 0) + ret = num_read; goto out; + } ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) @@ -3677,7 +4338,6 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); tlv_put_failure: out: fs_path_free(p); - set_fs(old_fs); if (ret < 0) return ret; return num_read; @@ -3730,7 +4390,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - clone_root->root->root_item.ctransid); + le64_to_cpu(clone_root->root->root_item.ctransid)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, clone_root->offset); @@ -3776,6 +4436,39 @@ out: return ret; } +static int send_hole(struct send_ctx *sctx, u64 end) +{ + struct fs_path *p = NULL; + u64 offset = sctx->cur_inode_last_extent; + u64 len; + int ret = 0; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto tlv_put_failure; + memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); + while (offset < end) { + len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + break; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = send_cmd(sctx); + if (ret < 0) + break; + offset += len; + } +tlv_put_failure: + fs_path_free(p); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -3788,12 +4481,14 @@ static int send_write_or_clone(struct send_ctx *sctx, u64 len; u32 l; u8 type; + u64 bs = sctx->send_root->fs_info->sb->s_blocksize; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(path->nodes[0], ei); + len = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], ei); /* * it is possible the inline item won't cover the whole page, * but there may be items after this page. Make @@ -3811,7 +4506,7 @@ static int send_write_or_clone(struct send_ctx *sctx, goto out; } - if (clone_root) { + if (clone_root && IS_ALIGNED(offset + len, bs)) { ret = send_clone(sctx, offset, len, clone_root); } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { ret = send_update_extent(sctx, offset, len); @@ -3926,16 +4621,16 @@ static int is_extent_unchanged(struct send_ctx *sctx, while (key.offset < ekey->offset + left_len) { ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); - right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); - right_len = btrfs_file_extent_num_bytes(eb, ei); - right_offset = btrfs_file_extent_offset(eb, ei); - right_gen = btrfs_file_extent_generation(eb, ei); - if (right_type != BTRFS_FILE_EXTENT_REG) { ret = 0; goto out; } + right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + right_len = btrfs_file_extent_num_bytes(eb, ei); + right_offset = btrfs_file_extent_offset(eb, ei); + right_gen = btrfs_file_extent_generation(eb, ei); + /* * Are we at extent 8? If yes, we know the extent is changed. * This may only happen on the first iteration. @@ -4003,6 +4698,101 @@ out: return ret; } +static int get_last_extent(struct send_ctx *sctx, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_root *root = sctx->send_root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + u8 type; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + sctx->cur_inode_last_extent = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); + if (ret < 0) + goto out; + ret = 0; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key.offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key.offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + sctx->cur_inode_last_extent = extent_end; +out: + btrfs_free_path(path); + return ret; +} + +static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 type; + int ret = 0; + + if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) + return 0; + + if (sctx->cur_inode_last_extent == (u64)-1) { + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key->offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key->offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + + if (path->slots[0] == 0 && + sctx->cur_inode_last_extent < key->offset) { + /* + * We might have skipped entire leafs that contained only + * file extent items for our current inode. These leafs have + * a generation number smaller (older) than the one in the + * current leaf and the leaf our last extent came from, and + * are located between these 2 leafs. + */ + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + if (sctx->cur_inode_last_extent < key->offset) + ret = send_hole(sctx, key->offset); + sctx->cur_inode_last_extent = extent_end; + return ret; +} + static int process_extent(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) @@ -4019,7 +4809,7 @@ static int process_extent(struct send_ctx *sctx, goto out; if (ret) { ret = 0; - goto out; + goto out_hole; } } else { struct btrfs_file_extent_item *ei; @@ -4055,7 +4845,10 @@ static int process_extent(struct send_ctx *sctx, goto out; ret = send_write_or_clone(sctx, path, key, found_clone); - + if (ret) + goto out; +out_hole: + ret = maybe_send_hole(sctx, path, key); out: return ret; } @@ -4078,17 +4871,25 @@ static int process_all_extents(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || @@ -4101,8 +4902,7 @@ static int process_all_extents(struct send_ctx *sctx) if (ret < 0) goto out; - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } out: @@ -4110,7 +4910,9 @@ out: return ret; } -static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, + int *pending_move, + int *refs_processed) { int ret = 0; @@ -4122,17 +4924,11 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) goto out; - ret = process_recorded_refs(sctx); + ret = process_recorded_refs(sctx, pending_move); if (ret < 0) goto out; - /* - * We have processed the refs and thus need to advance send_progress. - * Now, calls to get_cur_xxx will take the updated refs of the current - * inode into account. - */ - sctx->send_progress = sctx->cur_ino + 1; - + *refs_processed = 1; out: return ret; } @@ -4148,11 +4944,29 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) u64 right_gid; int need_chmod = 0; int need_chown = 0; + int pending_move = 0; + int refs_processed = 0; - ret = process_recorded_refs_if_needed(sctx, at_end); + ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, + &refs_processed); if (ret < 0) goto out; + /* + * We have processed the refs and thus need to advance send_progress. + * Now, calls to get_cur_xxx will take the updated refs of the current + * inode into account. + * + * On the other hand, if our current inode is a directory and couldn't + * be moved/renamed because its parent was renamed/moved too and it has + * a higher inode number, we can only move/rename our current inode + * after we moved/renamed its parent. Therefore in this case operate on + * the old path (pre move/rename) of our current inode, and the + * move/rename will be performed later. + */ + if (refs_processed && !pending_move) + sctx->send_progress = sctx->cur_ino + 1; + if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) goto out; if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) @@ -4181,6 +4995,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } if (S_ISREG(sctx->cur_inode_mode)) { + if (need_send_hole(sctx)) { + if (sctx->cur_inode_last_extent == (u64)-1 || + sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = get_last_extent(sctx, (u64)-1); + if (ret) + goto out; + } + if (sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = send_hole(sctx, sctx->cur_inode_size); + if (ret) + goto out; + } + } ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, sctx->cur_inode_size); if (ret < 0) @@ -4201,12 +5030,25 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } /* - * Need to send that every time, no matter if it actually changed - * between the two trees as we have done changes to the inode before. + * If other directory inodes depended on our current directory + * inode's move/rename, now do their move/rename operations. */ - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); - if (ret < 0) - goto out; + if (!is_waiting_for_move(sctx, sctx->cur_ino)) { + ret = apply_children_dir_moves(sctx); + if (ret) + goto out; + /* + * Need to send that every time, no matter if it actually + * changed between the two trees as we have done changes to + * the inode before. If our inode is a directory and it's + * waiting to be moved/renamed, we will send its utimes when + * it's moved/renamed, therefore we don't need to do it here. + */ + sctx->send_progress = sctx->cur_ino + 1; + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) + goto out; + } out: return ret; @@ -4222,12 +5064,9 @@ static int changed_inode(struct send_ctx *sctx, u64 left_gen = 0; u64 right_gen = 0; - ret = close_cur_inode_file(sctx); - if (ret < 0) - goto out; - sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; + sctx->cur_inode_last_extent = (u64)-1; /* * Set send_progress to current inode. This will tell all get_cur_xxx @@ -4276,6 +5115,8 @@ static int changed_inode(struct send_ctx *sctx, sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { @@ -4320,6 +5161,8 @@ static int changed_inode(struct send_ctx *sctx, sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); ret = send_create_inode_if_needed(sctx); if (ret < 0) goto out; @@ -4508,14 +5351,18 @@ static int changed_cb(struct btrfs_root *left_root, struct send_ctx *sctx = ctx; if (result == BTRFS_COMPARE_TREE_SAME) { - if (key->type != BTRFS_INODE_REF_KEY && - key->type != BTRFS_INODE_EXTREF_KEY) - return 0; - ret = compare_refs(sctx, left_path, key); - if (!ret) + if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) { + ret = compare_refs(sctx, left_path, key); + if (!ret) + return 0; + if (ret < 0) + return ret; + } else if (key->type == BTRFS_EXTENT_DATA_KEY) { + return maybe_send_hole(sctx, left_path, key); + } else { return 0; - if (ret < 0) - return ret; + } result = BTRFS_COMPARE_TREE_CHANGED; ret = 0; } @@ -4550,57 +5397,21 @@ out: static int full_send_tree(struct send_ctx *sctx) { int ret; - struct btrfs_trans_handle *trans = NULL; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; struct extent_buffer *eb; int slot; - u64 start_ctransid; - u64 ctransid; path = alloc_path_for_send(); if (!path) return -ENOMEM; - spin_lock(&send_root->root_item_lock); - start_ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_item_lock); - key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; -join_trans: - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. Join a transaction to prevent - * this. - */ - trans = btrfs_join_transaction(send_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - - /* - * Make sure the tree has not changed after re-joining. We detect this - * by comparing start_ctransid and ctransid. They should always match. - */ - spin_lock(&send_root->root_item_lock); - ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_item_lock); - - if (ctransid != start_ctransid) { - WARN(1, KERN_WARNING "btrfs: the root that you're trying to " - "send was modified in between. This is " - "probably a bug.\n"); - ret = -EIO; - goto out; - } - ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) goto out; @@ -4608,19 +5419,6 @@ join_trans: goto out_finish; while (1) { - /* - * When someone want to commit while we iterate, end the - * joined transaction and rejoin. - */ - if (btrfs_should_end_transaction(trans, send_root)) { - ret = btrfs_end_transaction(trans, send_root); - trans = NULL; - if (ret < 0) - goto out; - btrfs_release_path(path); - goto join_trans; - } - eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); @@ -4648,12 +5446,6 @@ out_finish: out: btrfs_free_path(path); - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, send_root); - else - btrfs_end_transaction(trans, send_root); - } return ret; } @@ -4686,15 +5478,25 @@ static int send_subvol(struct send_ctx *sctx) } out: - if (!ret) - ret = close_cur_inode_file(sctx); - else - close_cur_inode_file(sctx); - free_recorded_refs(sctx); return ret; } +static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) +{ + spin_lock(&root->root_item_lock); + root->send_in_progress--; + /* + * Not much left to do, we don't know why it's unbalanced and + * can't blindly reset it to 0. + */ + if (root->send_in_progress < 0) + btrfs_err(root->fs_info, + "send_in_progres unbalanced %d root %llu", + root->send_in_progress, root->root_key.objectid); + spin_unlock(&root->root_item_lock); +} + long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) { int ret = 0; @@ -4706,6 +5508,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; + int clone_sources_to_rollback = 0; + int sort_clone_roots = 0; + int index; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4714,38 +5519,26 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) fs_info = send_root->fs_info; /* + * The subvolume must remain read-only during send, protect against + * making it RW. This also protects against deletion. + */ + spin_lock(&send_root->root_item_lock); + send_root->send_in_progress++; + spin_unlock(&send_root->root_item_lock); + + /* * This is done when we lookup the root, it should already be complete * by the time we get here. */ WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); /* - * If we just created this root we need to make sure that the orphan - * cleanup has been done and committed since we search the commit root, - * so check its commit root transid with our otransid and if they match - * commit the transaction to make sure everything is updated. + * Userspace tools do the checks and warn the user if it's + * not RO. */ - down_read(&send_root->fs_info->extent_commit_sem); - if (btrfs_header_generation(send_root->commit_root) == - btrfs_root_otransid(&send_root->root_item)) { - struct btrfs_trans_handle *trans; - - up_read(&send_root->fs_info->extent_commit_sem); - - trans = btrfs_attach_transaction_barrier(send_root); - if (IS_ERR(trans)) { - if (PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto out; - } - /* ENOENT means theres no transaction */ - } else { - ret = btrfs_commit_transaction(trans, send_root); - if (ret) - goto out; - } - } else { - up_read(&send_root->fs_info->extent_commit_sem); + if (!btrfs_root_readonly(send_root)) { + ret = -EPERM; + goto out; } arg = memdup_user(arg_, sizeof(*arg)); @@ -4756,8 +5549,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } if (!access_ok(VERIFY_READ, arg->clone_sources, - sizeof(*arg->clone_sources * - arg->clone_sources_count))) { + sizeof(*arg->clone_sources) * + arg->clone_sources_count)) { ret = -EFAULT; goto out; } @@ -4786,9 +5579,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } - sctx->mnt = mnt_file->f_path.mnt; - sctx->send_root = send_root; + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started + */ + if (btrfs_root_dead(sctx->send_root)) { + ret = -EPERM; + goto out; + } + sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; @@ -4804,6 +5604,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; + sctx->clone_roots = vzalloc(sizeof(struct clone_root) * (arg->clone_sources_count + 1)); if (!sctx->clone_roots) { @@ -4831,11 +5635,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.objectid = clone_sources_tmp[i]; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + clone_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(clone_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(clone_root); goto out; } + clone_sources_to_rollback = i + 1; + spin_lock(&clone_root->root_item_lock); + clone_root->send_in_progress++; + if (!btrfs_root_readonly(clone_root)) { + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; + goto out; + } + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + sctx->clone_roots[i].root = clone_root; } vfree(clone_sources_tmp); @@ -4846,11 +5666,28 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.objectid = arg->parent_root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(sctx->parent_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(sctx->parent_root); goto out; } + + spin_lock(&sctx->parent_root->root_item_lock); + sctx->parent_root->send_in_progress++; + if (!btrfs_root_readonly(sctx->parent_root) || + btrfs_root_dead(sctx->parent_root)) { + spin_unlock(&sctx->parent_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; + goto out; + } + spin_unlock(&sctx->parent_root->root_item_lock); + + srcu_read_unlock(&fs_info->subvol_srcu, index); } /* @@ -4864,8 +5701,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sort(sctx->clone_roots, sctx->clone_roots_cnt, sizeof(*sctx->clone_roots), __clone_root_cmp_sort, NULL); + sort_clone_roots = 1; + current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); + current->journal_info = NULL; if (ret < 0) goto out; @@ -4879,6 +5719,58 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } out: + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { + struct rb_node *n; + struct pending_dir_move *pm; + + n = rb_first(&sctx->pending_dir_moves); + pm = rb_entry(n, struct pending_dir_move, node); + while (!list_empty(&pm->list)) { + struct pending_dir_move *pm2; + + pm2 = list_first_entry(&pm->list, + struct pending_dir_move, list); + free_pending_move(sctx, pm2); + } + free_pending_move(sctx, pm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { + struct rb_node *n; + struct waiting_dir_move *dm; + + n = rb_first(&sctx->waiting_dir_moves); + dm = rb_entry(n, struct waiting_dir_move, node); + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); + while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { + struct rb_node *n; + struct orphan_dir_info *odi; + + n = rb_first(&sctx->orphan_dirs); + odi = rb_entry(n, struct orphan_dir_info, node); + free_orphan_dir_info(sctx, odi); + } + + if (sort_clone_roots) { + for (i = 0; i < sctx->clone_roots_cnt; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + } else { + for (i = 0; sctx && i < clone_sources_to_rollback; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + + btrfs_root_dec_send_in_progress(send_root); + } + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) + btrfs_root_dec_send_in_progress(sctx->parent_root); + kfree(arg); vfree(clone_sources_tmp); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e913328d0f2..8e16bca69c5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -42,13 +42,14 @@ #include <linux/cleancache.h> #include <linux/ratelimit.h> #include <linux/btrfs.h> -#include "compat.h" #include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" +#include "hash.h" +#include "props.h" #include "xattr.h" #include "volumes.h" #include "export.h" @@ -65,6 +66,8 @@ static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; +static int btrfs_remount(struct super_block *sb, int *flags, char *data); + static const char *btrfs_decode_error(int errno) { char *errstr = "unknown"; @@ -153,11 +156,12 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n", + printk(KERN_CRIT + "BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n", sb->s_id, function, line, errno, errstr, &vaf); va_end(args); } else { - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n", + printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", sb->s_id, function, line, errno, errstr); } @@ -251,7 +255,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, */ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, &root->fs_info->fs_state)) { - WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n", + WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", errno); } trans->aborted = errno; @@ -295,8 +299,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", s_id, function, line, &vaf, errno, errstr); - printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); va_end(args); /* Caller calls BUG() */ } @@ -323,7 +327,9 @@ enum { Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, - Opt_commit_interval, + Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, + Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, + Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_err, }; @@ -333,8 +339,11 @@ static match_table_t tokens = { {Opt_subvolid, "subvolid=%s"}, {Opt_device, "device=%s"}, {Opt_nodatasum, "nodatasum"}, + {Opt_datasum, "datasum"}, {Opt_nodatacow, "nodatacow"}, + {Opt_datacow, "datacow"}, {Opt_nobarrier, "nobarrier"}, + {Opt_barrier, "barrier"}, {Opt_max_inline, "max_inline=%s"}, {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, @@ -345,18 +354,25 @@ static match_table_t tokens = { {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, + {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, + {Opt_treelog, "treelog"}, {Opt_flushoncommit, "flushoncommit"}, + {Opt_noflushoncommit, "noflushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_space_cache, "space_cache"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_enospc_debug, "enospc_debug"}, + {Opt_noenospc_debug, "noenospc_debug"}, {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, + {Opt_nodefrag, "noautodefrag"}, {Opt_inode_cache, "inode_cache"}, + {Opt_noinode_cache, "noinode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, {Opt_skip_balance, "skip_balance"}, @@ -384,6 +400,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; + bool compress = false; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (cache_gen) @@ -410,7 +427,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) token = match_token(p, tokens, args); switch (token) { case Opt_degraded: - printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_info(root->fs_info, "allowing degraded mounts"); btrfs_set_opt(info->mount_opt, DEGRADED); break; case Opt_subvol: @@ -423,28 +440,45 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ break; case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatasum\n"); - btrfs_set_opt(info->mount_opt, NODATASUM); + btrfs_set_and_info(root, NODATASUM, + "setting nodatasum"); + break; + case Opt_datasum: + if (btrfs_test_opt(root, NODATASUM)) { + if (btrfs_test_opt(root, NODATACOW)) + btrfs_info(root->fs_info, "setting datasum, datacow enabled"); + else + btrfs_info(root->fs_info, "setting datasum"); + } + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - if (!btrfs_test_opt(root, COMPRESS) || - !btrfs_test_opt(root, FORCE_COMPRESS)) { - printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n"); - } else { - printk(KERN_INFO "btrfs: setting nodatacow\n"); + if (!btrfs_test_opt(root, NODATACOW)) { + if (!btrfs_test_opt(root, COMPRESS) || + !btrfs_test_opt(root, FORCE_COMPRESS)) { + btrfs_info(root->fs_info, + "setting nodatacow, compression disabled"); + } else { + btrfs_info(root->fs_info, "setting nodatacow"); + } } - info->compress_type = BTRFS_COMPRESS_NONE; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; + case Opt_datacow: + btrfs_clear_and_info(root, NODATACOW, + "setting datacow"); + break; case Opt_compress_force: case Opt_compress_force_type: compress_force = true; /* Fallthrough */ case Opt_compress: case Opt_compress_type: + compress = true; if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -462,7 +496,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_fs_incompat(info, COMPRESS_LZO); } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; - info->compress_type = BTRFS_COMPRESS_NONE; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; @@ -472,33 +505,37 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } if (compress_force) { - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); - pr_info("btrfs: force %s compression\n", - compress_type); - } else - pr_info("btrfs: use %s compression\n", - compress_type); + btrfs_set_and_info(root, FORCE_COMPRESS, + "force %s compression", + compress_type); + } else if (compress) { + if (!btrfs_test_opt(root, COMPRESS)) + btrfs_info(root->fs_info, + "btrfs: use %s compression", + compress_type); + } break; case Opt_ssd: - printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); + btrfs_set_and_info(root, SSD, + "use ssd allocation scheme"); break; case Opt_ssd_spread: - printk(KERN_INFO "btrfs: use spread ssd " - "allocation scheme\n"); + btrfs_set_and_info(root, SSD_SPREAD, + "use spread ssd allocation scheme"); btrfs_set_opt(info->mount_opt, SSD); - btrfs_set_opt(info->mount_opt, SSD_SPREAD); break; case Opt_nossd: - printk(KERN_INFO "btrfs: not using ssd allocation " - "scheme\n"); - btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_set_and_info(root, NOSSD, + "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); - btrfs_clear_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_barrier: + btrfs_clear_and_info(root, NOBARRIER, + "turning on barriers"); break; case Opt_nobarrier: - printk(KERN_INFO "btrfs: turning off barriers\n"); - btrfs_set_opt(info->mount_opt, NOBARRIER); + btrfs_set_and_info(root, NOBARRIER, + "turning off barriers"); break; case Opt_thread_pool: ret = match_int(&args[0], &intarg); @@ -518,11 +555,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); if (info->max_inline) { - info->max_inline = max_t(u64, + info->max_inline = min_t(u64, info->max_inline, root->sectorsize); } - printk(KERN_INFO "btrfs: max_inline at %llu\n", + btrfs_info(root->fs_info, "max_inline at %llu", info->max_inline); } else { ret = -ENOMEM; @@ -536,24 +573,41 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->alloc_start = memparse(num, NULL); mutex_unlock(&info->chunk_mutex); kfree(num); - printk(KERN_INFO - "btrfs: allocations start at %llu\n", + btrfs_info(root->fs_info, "allocations start at %llu", info->alloc_start); } else { ret = -ENOMEM; goto out; } break; + case Opt_acl: +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + root->fs_info->sb->s_flags |= MS_POSIXACL; + break; +#else + btrfs_err(root->fs_info, + "support for ACL not compiled in!"); + ret = -EINVAL; + goto out; +#endif case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; case Opt_notreelog: - printk(KERN_INFO "btrfs: disabling tree log\n"); - btrfs_set_opt(info->mount_opt, NOTREELOG); + btrfs_set_and_info(root, NOTREELOG, + "disabling tree log"); + break; + case Opt_treelog: + btrfs_clear_and_info(root, NOTREELOG, + "enabling tree log"); break; case Opt_flushoncommit: - printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); - btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + btrfs_set_and_info(root, FLUSHONCOMMIT, + "turning on flush-on-commit"); + break; + case Opt_noflushoncommit: + btrfs_clear_and_info(root, FLUSHONCOMMIT, + "turning off flush-on-commit"); break; case Opt_ratio: ret = match_int(&args[0], &intarg); @@ -561,7 +615,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) goto out; } else if (intarg >= 0) { info->metadata_ratio = intarg; - printk(KERN_INFO "btrfs: metadata ratio %d\n", + btrfs_info(root->fs_info, "metadata ratio %d", info->metadata_ratio); } else { ret = -EINVAL; @@ -569,25 +623,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } break; case Opt_discard: - btrfs_set_opt(info->mount_opt, DISCARD); + btrfs_set_and_info(root, DISCARD, + "turning on discard"); + break; + case Opt_nodiscard: + btrfs_clear_and_info(root, DISCARD, + "turning off discard"); break; case Opt_space_cache: - btrfs_set_opt(info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(root, SPACE_CACHE, + "enabling disk space caching"); break; case Opt_rescan_uuid_tree: btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - printk(KERN_INFO "btrfs: disabling disk space caching\n"); - btrfs_clear_opt(info->mount_opt, SPACE_CACHE); + btrfs_clear_and_info(root, SPACE_CACHE, + "disabling disk space caching"); break; case Opt_inode_cache: - printk(KERN_INFO "btrfs: enabling inode map caching\n"); - btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); + btrfs_set_and_info(root, CHANGE_INODE_CACHE, + "enabling inode map caching"); + break; + case Opt_noinode_cache: + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); break; case Opt_clear_cache: - printk(KERN_INFO "btrfs: force clearing of disk cache\n"); - btrfs_set_opt(info->mount_opt, CLEAR_CACHE); + btrfs_set_and_info(root, CLEAR_CACHE, + "force clearing of disk cache"); break; case Opt_user_subvol_rm_allowed: btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); @@ -595,12 +659,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_enospc_debug: btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); break; + case Opt_noenospc_debug: + btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); + break; case Opt_defrag: - printk(KERN_INFO "btrfs: enabling auto defrag\n"); - btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); + btrfs_set_and_info(root, AUTO_DEFRAG, + "enabling auto defrag"); + break; + case Opt_nodefrag: + btrfs_clear_and_info(root, AUTO_DEFRAG, + "disabling auto defrag"); break; case Opt_recovery: - printk(KERN_INFO "btrfs: enabling auto recovery\n"); + btrfs_info(root->fs_info, "enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; case Opt_skip_balance: @@ -608,14 +679,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) break; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY case Opt_check_integrity_including_extent_data: - printk(KERN_INFO "btrfs: enabling check integrity" - " including extent data\n"); + btrfs_info(root->fs_info, + "enabling check integrity including extent data"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity: - printk(KERN_INFO "btrfs: enabling check integrity\n"); + btrfs_info(root->fs_info, "enabling check integrity"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity_print_mask: @@ -624,8 +695,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) goto out; } else if (intarg >= 0) { info->check_integrity_print_mask = intarg; - printk(KERN_INFO "btrfs:" - " check_integrity_print_mask 0x%x\n", + btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); } else { ret = -EINVAL; @@ -636,8 +706,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_check_integrity_including_extent_data: case Opt_check_integrity: case Opt_check_integrity_print_mask: - printk(KERN_ERR "btrfs: support for check_integrity*" - " not compiled in!\n"); + btrfs_err(root->fs_info, + "support for check_integrity* not compiled in!"); ret = -EINVAL; goto out; #endif @@ -657,28 +727,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) intarg = 0; ret = match_int(&args[0], &intarg); if (ret < 0) { - printk(KERN_ERR - "btrfs: invalid commit interval\n"); + btrfs_err(root->fs_info, "invalid commit interval"); ret = -EINVAL; goto out; } if (intarg > 0) { if (intarg > 300) { - printk(KERN_WARNING - "btrfs: excessive commit interval %d\n", + btrfs_warn(root->fs_info, "excessive commit interval %d", intarg); } info->commit_interval = intarg; } else { - printk(KERN_INFO - "btrfs: using default commit interval %ds\n", + btrfs_info(root->fs_info, "using default commit interval %ds", BTRFS_DEFAULT_COMMIT_INTERVAL); info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; } break; case Opt_err: - printk(KERN_INFO "btrfs: unrecognized mount option " - "'%s'\n", p); + btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); ret = -EINVAL; goto out; default: @@ -687,7 +753,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } out: if (!ret && btrfs_test_opt(root, SPACE_CACHE)) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); + btrfs_info(root->fs_info, "disk space caching is enabled"); kfree(orig); return ret; } @@ -750,7 +816,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, break; case Opt_subvolrootid: printk(KERN_WARNING - "btrfs: 'subvolrootid' mount option is deprecated and has no effect\n"); + "BTRFS: 'subvolrootid' mount option is deprecated and has " + "no effect\n"); break; case Opt_device: device_name = match_strdup(&args[0]); @@ -784,6 +851,7 @@ static struct dentry *get_default_root(struct super_block *sb, struct btrfs_path *path; struct btrfs_key location; struct inode *inode; + struct dentry *dentry; u64 dir_id; int new = 0; @@ -854,7 +922,13 @@ setup_root: return dget(sb->s_root); } - return d_obtain_alias(inode); + dentry = d_obtain_alias(inode); + if (!IS_ERR(dentry)) { + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_DISCONNECTED; + spin_unlock(&dentry->d_lock); + } + return dentry; } static int btrfs_fill_super(struct super_block *sb, @@ -879,7 +953,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_I_VERSION; err = open_ctree(sb, fs_devices, (char *)data); if (err) { - printk("btrfs: open_ctree failed\n"); + printk(KERN_ERR "BTRFS: open_ctree failed\n"); return err; } @@ -921,7 +995,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_all_ordered_extents(fs_info); + btrfs_wait_ordered_roots(fs_info, -1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1106,7 +1180,31 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, return ERR_PTR(-ENOMEM); mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); + + if (PTR_RET(mnt) == -EBUSY) { + if (flags & MS_RDONLY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, + newargs); + } else { + int r; + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, + newargs); + if (IS_ERR(mnt)) { + kfree(newargs); + return ERR_CAST(mnt); + } + + r = btrfs_remount(mnt->mnt_sb, &flags, NULL); + if (r < 0) { + /* FIXME: release vfsmount mnt ??*/ + kfree(newargs); + return ERR_PTR(r); + } + } + } + kfree(newargs); + if (IS_ERR(mnt)) return ERR_CAST(mnt); @@ -1117,7 +1215,7 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, dput(root); root = ERR_PTR(-EINVAL); deactivate_locked_super(s); - printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", + printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", subvol_name); } @@ -1227,13 +1325,6 @@ error_fs_info: return ERR_PTR(error); } -static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit) -{ - spin_lock_irq(&workers->lock); - workers->max_workers = new_limit; - spin_unlock_irq(&workers->lock); -} - static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, int new_pool_size, int old_pool_size) { @@ -1242,24 +1333,23 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, fs_info->thread_pool_size = new_pool_size; - printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n", + btrfs_info(fs_info, "resize thread pool %d -> %d", old_pool_size, new_pool_size); - btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); - btrfs_set_max_workers(&fs_info->workers, new_pool_size); - btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); - btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, - new_pool_size); + btrfs_workqueue_set_max(fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, + new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); + btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, + new_pool_size); } static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) @@ -1310,6 +1400,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; + sync_filesystem(sb); btrfs_remount_prepare(fs_info); ret = btrfs_parse_options(root, data); @@ -1330,6 +1421,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * this also happens on 'umount -rf' or on shutdown, when * the filesystem is busy. */ + cancel_work_sync(&fs_info->async_reclaim_work); + + /* wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* avoid complains from lockdep et al. */ + up(&fs_info->uuid_tree_rescan_sem); + sb->s_flags |= MS_RDONLY; btrfs_dev_replace_suspend_for_unmount(fs_info); @@ -1342,7 +1440,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } else { if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { btrfs_err(fs_info, - "Remounting read-write after error is not allowed\n"); + "Remounting read-write after error is not allowed"); ret = -EINVAL; goto restore; } @@ -1354,8 +1452,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(*flags & MS_RDONLY)) { - printk(KERN_WARNING - "Btrfs: too many missing devices, writeable remount is not allowed\n"); + btrfs_warn(fs_info, + "too many missing devices, writeable remount is not allowed"); ret = -EACCES; goto restore; } @@ -1370,7 +1468,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; /* recover relocation */ + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret) goto restore; @@ -1380,22 +1480,22 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("btrfs: failed to resume dev_replace\n"); + btrfs_warn(fs_info, "failed to resume dev_replace"); goto restore; } if (!fs_info->uuid_root) { - pr_info("btrfs: creating UUID tree\n"); + btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to create the uuid tree" - "%d\n", ret); + btrfs_warn(fs_info, "failed to create the UUID tree %d", ret); goto restore; } } sb->s_flags &= ~MS_RDONLY; } out: + wake_up_process(fs_info->transaction_kthread); btrfs_remount_cleanup(fs_info, old_opts); return 0; @@ -1465,7 +1565,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) nr_devices = fs_info->fs_devices->open_devices; BUG_ON(!nr_devices); - devices_info = kmalloc(sizeof(*devices_info) * nr_devices, + devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), GFP_NOFS); if (!devices_info) return -ENOMEM; @@ -1711,6 +1811,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) list_for_each_entry(dev, head, dev_list) { if (dev->missing) continue; + if (!dev->name) + continue; if (!first_dev || dev->devid < first_dev->devid) first_dev = dev; } @@ -1769,7 +1871,7 @@ static int btrfs_interface_init(void) static void btrfs_interface_exit(void) { if (misc_deregister(&btrfs_misc) < 0) - printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); + printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n"); } static void btrfs_print_info(void) @@ -1789,17 +1891,44 @@ static void btrfs_print_info(void) static int btrfs_run_sanity_tests(void) { - return btrfs_test_free_space_cache(); + int ret; + + ret = btrfs_init_test_fs(); + if (ret) + return ret; + + ret = btrfs_test_free_space_cache(); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(); + if (ret) + goto out; + ret = btrfs_test_extent_io(); + if (ret) + goto out; + ret = btrfs_test_inodes(); + if (ret) + goto out; + ret = btrfs_test_qgroups(); +out: + btrfs_destroy_test_fs(); + return ret; } static int __init init_btrfs_fs(void) { int err; - err = btrfs_init_sysfs(); + err = btrfs_hash_init(); if (err) return err; + btrfs_props_init(); + + err = btrfs_init_sysfs(); + if (err) + goto free_hash; + btrfs_init_compress(); err = btrfs_init_cachep(); @@ -1873,6 +2002,8 @@ free_cachep: free_compress: btrfs_exit_compress(); btrfs_exit_sysfs(); +free_hash: + btrfs_hash_exit(); return err; } @@ -1891,9 +2022,10 @@ static void __exit exit_btrfs_fs(void) btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); btrfs_exit_compress(); + btrfs_hash_exit(); } -module_init(init_btrfs_fs) +late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5b326cd60a4..78699364f53 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -22,24 +22,732 @@ #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/kobject.h> +#include <linux/bug.h> +#include <linux/genhd.h> +#include <linux/debugfs.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "sysfs.h" +#include "volumes.h" + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); + +static u64 get_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + return btrfs_super_compat_flags(disk_super); + else if (set == FEAT_COMPAT_RO) + return btrfs_super_compat_ro_flags(disk_super); + else + return btrfs_super_incompat_flags(disk_super); +} + +static void set_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set, u64 features) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + btrfs_set_super_compat_flags(disk_super, features); + else if (set == FEAT_COMPAT_RO) + btrfs_set_super_compat_ro_flags(disk_super, features); + else + btrfs_set_super_incompat_flags(disk_super, features); +} + +static int can_modify_feature(struct btrfs_feature_attr *fa) +{ + int val = 0; + u64 set, clear; + switch (fa->feature_set) { + case FEAT_COMPAT: + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + break; + case FEAT_COMPAT_RO: + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + break; + case FEAT_INCOMPAT: + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + break; + default: + printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n", + fa->feature_set); + return 0; + } + + if (set & fa->feature_bit) + val |= 1; + if (clear & fa->feature_bit) + val |= 2; + + return val; +} + +static ssize_t btrfs_feature_attr_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + int val = 0; + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + if (fs_info) { + u64 features = get_features(fs_info, fa->feature_set); + if (features & fa->feature_bit) + val = 1; + } else + val = can_modify_feature(fa); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} + +static ssize_t btrfs_feature_attr_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t count) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + struct btrfs_trans_handle *trans; + u64 features, set, clear; + unsigned long val; + int ret; + + fs_info = to_fs_info(kobj); + if (!fs_info) + return -EPERM; + + ret = kstrtoul(skip_spaces(buf), 0, &val); + if (ret) + return ret; + + if (fa->feature_set == FEAT_COMPAT) { + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + } else if (fa->feature_set == FEAT_COMPAT_RO) { + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + } else { + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + } + + features = get_features(fs_info, fa->feature_set); + + /* Nothing to do */ + if ((val && (features & fa->feature_bit)) || + (!val && !(features & fa->feature_bit))) + return count; + + if ((val && !(set & fa->feature_bit)) || + (!val && !(clear & fa->feature_bit))) { + btrfs_info(fs_info, + "%sabling feature %s on mounted fs is not supported.", + val ? "En" : "Dis", fa->kobj_attr.attr.name); + return -EPERM; + } + + btrfs_info(fs_info, "%s %s feature flag", + val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); + + trans = btrfs_start_transaction(fs_info->fs_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&fs_info->super_lock); + features = get_features(fs_info, fa->feature_set); + if (val) + features |= fa->feature_bit; + else + features &= ~fa->feature_bit; + set_features(fs_info, fa->feature_set, features); + spin_unlock(&fs_info->super_lock); + + ret = btrfs_commit_transaction(trans, fs_info->fs_root); + if (ret) + return ret; + + return count; +} + +static umode_t btrfs_feature_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + umode_t mode = attr->mode; + + if (fs_info) { + struct btrfs_feature_attr *fa; + u64 features; + + fa = attr_to_btrfs_feature_attr(attr); + features = get_features(fs_info, fa->feature_set); + + if (can_modify_feature(fa)) + mode |= S_IWUSR; + else if (!(features & fa->feature_bit)) + mode = 0; + } + + return mode; +} + +BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); +BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); +BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); +BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); +BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); +BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); +BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); + +static struct attribute *btrfs_supported_feature_attrs[] = { + BTRFS_FEAT_ATTR_PTR(mixed_backref), + BTRFS_FEAT_ATTR_PTR(default_subvol), + BTRFS_FEAT_ATTR_PTR(mixed_groups), + BTRFS_FEAT_ATTR_PTR(compress_lzo), + BTRFS_FEAT_ATTR_PTR(big_metadata), + BTRFS_FEAT_ATTR_PTR(extended_iref), + BTRFS_FEAT_ATTR_PTR(raid56), + BTRFS_FEAT_ATTR_PTR(skinny_metadata), + BTRFS_FEAT_ATTR_PTR(no_holes), + NULL +}; + +static const struct attribute_group btrfs_feature_attr_group = { + .name = "features", + .is_visible = btrfs_feature_visible, + .attrs = btrfs_supported_feature_attrs, +}; + +static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) +{ + u64 val; + if (lock) + spin_lock(lock); + val = *value_ptr; + if (lock) + spin_unlock(lock); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} + +static ssize_t global_rsv_size_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show); + +static ssize_t global_rsv_reserved_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); + +#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) +#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); +BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); +BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) + +{ + struct btrfs_space_info *sinfo = to_space_info(kobj->parent); + struct btrfs_block_group_cache *block_group; + int index = to_raid_kobj(kobj)->raid_type; + u64 val = 0; + + down_read(&sinfo->groups_sem); + list_for_each_entry(block_group, &sinfo->block_groups[index], list) { + if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) + val += block_group->key.offset; + else + val += btrfs_block_group_used(&block_group->item); + } + up_read(&sinfo->groups_sem); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} + +static struct attribute *raid_attributes[] = { + BTRFS_RAID_ATTR_PTR(total_bytes), + BTRFS_RAID_ATTR_PTR(used_bytes), + NULL +}; + +static void release_raid_kobj(struct kobject *kobj) +{ + kfree(to_raid_kobj(kobj)); +} + +struct kobj_type btrfs_raid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = release_raid_kobj, + .default_attrs = raid_attributes, +}; + +#define SPACE_INFO_ATTR(field) \ +static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_space_info *sinfo = to_space_info(kobj); \ + return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ +} \ +BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field) + +static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); + return snprintf(buf, PAGE_SIZE, "%lld\n", val); +} + +SPACE_INFO_ATTR(flags); +SPACE_INFO_ATTR(total_bytes); +SPACE_INFO_ATTR(bytes_used); +SPACE_INFO_ATTR(bytes_pinned); +SPACE_INFO_ATTR(bytes_reserved); +SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(disk_used); +SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned); + +static struct attribute *space_info_attrs[] = { + BTRFS_ATTR_PTR(flags), + BTRFS_ATTR_PTR(total_bytes), + BTRFS_ATTR_PTR(bytes_used), + BTRFS_ATTR_PTR(bytes_pinned), + BTRFS_ATTR_PTR(bytes_reserved), + BTRFS_ATTR_PTR(bytes_may_use), + BTRFS_ATTR_PTR(disk_used), + BTRFS_ATTR_PTR(disk_total), + BTRFS_ATTR_PTR(total_bytes_pinned), + NULL, +}; + +static void space_info_release(struct kobject *kobj) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + percpu_counter_destroy(&sinfo->total_bytes_pinned); + kfree(sinfo); +} + +struct kobj_type space_info_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = space_info_release, + .default_attrs = space_info_attrs, +}; + +static const struct attribute *allocation_attrs[] = { + BTRFS_ATTR_PTR(global_rsv_reserved), + BTRFS_ATTR_PTR(global_rsv_size), + NULL, +}; + +static ssize_t btrfs_label_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label); +} + +static ssize_t btrfs_label_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->fs_root; + int ret; + + if (len >= BTRFS_LABEL_SIZE) + return -EINVAL; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&root->fs_info->super_lock); + strcpy(fs_info->super_copy->label, buf); + spin_unlock(&root->fs_info->super_lock); + ret = btrfs_commit_transaction(trans, root); + + if (!ret) + return len; + + return ret; +} +BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); + +static ssize_t btrfs_no_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + return -EPERM; +} + +static ssize_t btrfs_nodesize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); +} + +BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); + +static ssize_t btrfs_sectorsize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); + +static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); + +static struct attribute *btrfs_attrs[] = { + BTRFS_ATTR_PTR(label), + BTRFS_ATTR_PTR(nodesize), + BTRFS_ATTR_PTR(sectorsize), + BTRFS_ATTR_PTR(clone_alignment), + NULL, +}; + +static void btrfs_release_super_kobj(struct kobject *kobj) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + complete(&fs_info->kobj_unregister); +} + +static struct kobj_type btrfs_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = btrfs_release_super_kobj, + .default_attrs = btrfs_attrs, +}; + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_info, super_kobj); +} + +#define NUM_FEATURE_BITS 64 +static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; +static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; + +static u64 supported_feature_masks[3] = { + [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, + [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, + [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, +}; + +static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) +{ + int set; + + for (set = 0; set < FEAT_MAX; set++) { + int i; + struct attribute *attrs[2]; + struct attribute_group agroup = { + .name = "features", + .attrs = attrs, + }; + u64 features = get_features(fs_info, set); + features &= ~supported_feature_masks[set]; + + if (!features) + continue; + + attrs[1] = NULL; + for (i = 0; i < NUM_FEATURE_BITS; i++) { + struct btrfs_feature_attr *fa; + + if (!(features & (1ULL << i))) + continue; + + fa = &btrfs_feature_attrs[set][i]; + attrs[0] = &fa->kobj_attr.attr; + if (add) { + int ret; + ret = sysfs_merge_group(&fs_info->super_kobj, + &agroup); + if (ret) + return ret; + } else + sysfs_unmerge_group(&fs_info->super_kobj, + &agroup); + } + + } + return 0; +} + +static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +{ + kobject_del(&fs_info->super_kobj); + kobject_put(&fs_info->super_kobj); + wait_for_completion(&fs_info->kobj_unregister); +} + +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +{ + if (fs_info->space_info_kobj) { + sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); + kobject_del(fs_info->space_info_kobj); + kobject_put(fs_info->space_info_kobj); + } + kobject_del(fs_info->device_dir_kobj); + kobject_put(fs_info->device_dir_kobj); + addrm_unknown_feature_attrs(fs_info, false); + sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); + __btrfs_sysfs_remove_one(fs_info); +} + +const char * const btrfs_feature_set_names[3] = { + [FEAT_COMPAT] = "compat", + [FEAT_COMPAT_RO] = "compat_ro", + [FEAT_INCOMPAT] = "incompat", +}; + +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) +{ + size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ + int len = 0; + int i; + char *str; + + str = kmalloc(bufsize, GFP_KERNEL); + if (!str) + return str; + + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + const char *name; + + if (!(flags & (1ULL << i))) + continue; + + name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; + len += snprintf(str + len, bufsize - len, "%s%s", + len ? "," : "", name); + } + + return str; +} + +static void init_feature_attrs(void) +{ + struct btrfs_feature_attr *fa; + int set, i; + + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != + ARRAY_SIZE(btrfs_feature_attrs)); + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != + ARRAY_SIZE(btrfs_feature_attrs[0])); + + memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); + memset(btrfs_unknown_feature_names, 0, + sizeof(btrfs_unknown_feature_names)); + + for (i = 0; btrfs_supported_feature_attrs[i]; i++) { + struct btrfs_feature_attr *sfa; + struct attribute *a = btrfs_supported_feature_attrs[i]; + int bit; + sfa = attr_to_btrfs_feature_attr(a); + bit = ilog2(sfa->feature_bit); + fa = &btrfs_feature_attrs[sfa->feature_set][bit]; + + fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name; + } + + for (set = 0; set < FEAT_MAX; set++) { + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + char *name = btrfs_unknown_feature_names[set][i]; + fa = &btrfs_feature_attrs[set][i]; + + if (fa->kobj_attr.attr.name) + continue; + + snprintf(name, 13, "%s:%u", + btrfs_feature_set_names[set], i); + + fa->kobj_attr.attr.name = name; + fa->kobj_attr.attr.mode = S_IRUGO; + fa->feature_set = set; + fa->feature_bit = 1ULL << i; + } + } +} + +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!fs_info->device_dir_kobj) + return -EINVAL; + + if (one_device) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_info->device_dir_kobj, + disk_kobj->name); + } + + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + int error = 0; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *dev; + + if (!fs_info->device_dir_kobj) + fs_info->device_dir_kobj = kobject_create_and_add("devices", + &fs_info->super_kobj); + + if (!fs_info->device_dir_kobj) + return -ENOMEM; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!dev->bdev) + continue; + + if (one_device && one_device != dev) + continue; + + disk = dev->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + error = sysfs_create_link(fs_info->device_dir_kobj, + disk_kobj, disk_kobj->name); + if (error) + break; + } + + return error; +} /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; +/* /sys/kernel/debug/btrfs */ +static struct dentry *btrfs_debugfs_root_dentry; + +/* Debugging tunables and exported data */ +u64 btrfs_debugfs_test; + +int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) +{ + int error; + + init_completion(&fs_info->kobj_unregister); + fs_info->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, + "%pU", fs_info->fsid); + if (error) + return error; + + error = sysfs_create_group(&fs_info->super_kobj, + &btrfs_feature_attr_group); + if (error) { + __btrfs_sysfs_remove_one(fs_info); + return error; + } + + error = addrm_unknown_feature_attrs(fs_info, true); + if (error) + goto failure; + + error = btrfs_kobj_add_device(fs_info, NULL); + if (error) + goto failure; + + fs_info->space_info_kobj = kobject_create_and_add("allocation", + &fs_info->super_kobj); + if (!fs_info->space_info_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); + if (error) + goto failure; + + return 0; +failure: + btrfs_sysfs_remove_one(fs_info); + return error; +} + +static int btrfs_init_debugfs(void) +{ +#ifdef CONFIG_DEBUG_FS + btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL); + if (!btrfs_debugfs_root_dentry) + return -ENOMEM; + + debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry, + &btrfs_debugfs_test); +#endif + return 0; +} + int btrfs_init_sysfs(void) { + int ret; + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); if (!btrfs_kset) return -ENOMEM; - return 0; + + ret = btrfs_init_debugfs(); + if (ret) + return ret; + + init_feature_attrs(); + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + + return ret; } void btrfs_exit_sysfs(void) { + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); + debugfs_remove_recursive(btrfs_debugfs_root_dentry); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h new file mode 100644 index 00000000000..ac46df37504 --- /dev/null +++ b/fs/btrfs/sysfs.h @@ -0,0 +1,73 @@ +#ifndef _BTRFS_SYSFS_H_ +#define _BTRFS_SYSFS_H_ + +/* + * Data exported through sysfs + */ +extern u64 btrfs_debugfs_test; + +enum btrfs_feature_set { + FEAT_COMPAT, + FEAT_COMPAT_RO, + FEAT_INCOMPAT, + FEAT_MAX +}; + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \ +static struct kobj_attribute btrfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, _mode, _show, _store) +#define BTRFS_ATTR(_name, _mode, _show) \ + BTRFS_ATTR_RW(_name, _mode, _show, NULL) +#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) + +#define BTRFS_RAID_ATTR(_name, _show) \ +static struct kobj_attribute btrfs_raid_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) +#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) + + +struct btrfs_feature_attr { + struct kobj_attribute kobj_attr; + enum btrfs_feature_set feature_set; + u64 feature_bit; +}; + +#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_##_name = { \ + .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ + btrfs_feature_attr_show, \ + btrfs_feature_attr_store), \ + .feature_set = _feature_set, \ + .feature_bit = _prefix ##_## _feature_bit, \ +} +#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) + +#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) + +/* convert from attribute */ +#define to_btrfs_feature_attr(a) \ + container_of(a, struct btrfs_feature_attr, kobj_attr) +#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr) +#define attr_to_btrfs_feature_attr(a) \ + to_btrfs_feature_attr(attr_to_btrfs_attr(a)) +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); +extern const char * const btrfs_feature_set_names[3]; +extern struct kobj_type space_info_ktype; +extern struct kobj_type btrfs_raid_ktype; +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); +#endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c new file mode 100644 index 00000000000..9626252ee6b --- /dev/null +++ b/fs/btrfs/tests/btrfs-tests.c @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/magic.h> +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../volumes.h" +#include "../disk-io.h" +#include "../qgroup.h" + +static struct vfsmount *test_mnt = NULL; + +static const struct super_operations btrfs_test_super_ops = { + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_test_destroy_inode, +}; + +static struct dentry *btrfs_test_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops, + NULL, BTRFS_TEST_MAGIC); +} + +static struct file_system_type test_type = { + .name = "btrfs_test_fs", + .mount = btrfs_test_mount, + .kill_sb = kill_anon_super, +}; + +struct inode *btrfs_new_test_inode(void) +{ + return new_inode(test_mnt->mnt_sb); +} + +int btrfs_init_test_fs(void) +{ + int ret; + + ret = register_filesystem(&test_type); + if (ret) { + printk(KERN_ERR "btrfs: cannot register test file system\n"); + return ret; + } + + test_mnt = kern_mount(&test_type); + if (IS_ERR(test_mnt)) { + printk(KERN_ERR "btrfs: cannot mount test file system\n"); + unregister_filesystem(&test_type); + return ret; + } + return 0; +} + +void btrfs_destroy_test_fs(void) +{ + kern_unmount(test_mnt); + unregister_filesystem(&test_type); +} + +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) +{ + struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), + GFP_NOFS); + + if (!fs_info) + return fs_info; + fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), + GFP_NOFS); + if (!fs_info->fs_devices) { + kfree(fs_info); + return NULL; + } + fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), + GFP_NOFS); + if (!fs_info->super_copy) { + kfree(fs_info->fs_devices); + kfree(fs_info); + return NULL; + } + + if (init_srcu_struct(&fs_info->subvol_srcu)) { + kfree(fs_info->fs_devices); + kfree(fs_info->super_copy); + kfree(fs_info); + return NULL; + } + + spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->qgroup_lock); + spin_lock_init(&fs_info->qgroup_op_lock); + spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->tree_mod_seq_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + mutex_init(&fs_info->qgroup_rescan_lock); + rwlock_init(&fs_info->tree_mod_log_lock); + fs_info->running_transaction = NULL; + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_ulist = NULL; + atomic64_set(&fs_info->tree_mod_seq, 0); + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + return fs_info; +} + +static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +{ + struct radix_tree_iter iter; + void **slot; + + spin_lock(&fs_info->buffer_lock); +restart: + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + goto restart; + continue; + } + spin_unlock(&fs_info->buffer_lock); + free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); + } + spin_unlock(&fs_info->buffer_lock); + + btrfs_free_qgroup_config(fs_info); + btrfs_free_fs_roots(fs_info); + cleanup_srcu_struct(&fs_info->subvol_srcu); + kfree(fs_info->super_copy); + kfree(fs_info->fs_devices); + kfree(fs_info); +} + +void btrfs_free_dummy_root(struct btrfs_root *root) +{ + if (!root) + return; + if (root->node) + free_extent_buffer(root->node); + if (root->fs_info) + btrfs_free_dummy_fs_info(root->fs_info); + kfree(root); +} + diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 58087762577..fd395422448 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -21,14 +21,48 @@ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__) +#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) + +struct btrfs_root; int btrfs_test_free_space_cache(void); +int btrfs_test_extent_buffer_operations(void); +int btrfs_test_extent_io(void); +int btrfs_test_inodes(void); +int btrfs_test_qgroups(void); +int btrfs_init_test_fs(void); +void btrfs_destroy_test_fs(void); +struct inode *btrfs_new_test_inode(void); +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_root(struct btrfs_root *root); #else static inline int btrfs_test_free_space_cache(void) { return 0; } +static inline int btrfs_test_extent_buffer_operations(void) +{ + return 0; +} +static inline int btrfs_init_test_fs(void) +{ + return 0; +} +static inline void btrfs_destroy_test_fs(void) +{ +} +static inline int btrfs_test_extent_io(void) +{ + return 0; +} +static inline int btrfs_test_inodes(void) +{ + return 0; +} +static inline int btrfs_test_qgroups(void) +{ + return 0; +} #endif #endif diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c new file mode 100644 index 00000000000..cc286ce97d1 --- /dev/null +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/slab.h> +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../extent_io.h" +#include "../disk-io.h" + +static int test_btrfs_split_item(void) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct extent_buffer *eb; + struct btrfs_item *item; + char *value = "mary had a little lamb"; + char *split1 = "mary had a little"; + char *split2 = " lamb"; + char *split3 = "mary"; + char *split4 = " had a little"; + char buf[32]; + struct btrfs_key key; + u32 value_len = strlen(value); + int ret = 0; + + test_msg("Running btrfs_split_item tests\n"); + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Could not allocate root\n"); + return PTR_ERR(root); + } + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Could not allocate path\n"); + kfree(root); + return -ENOMEM; + } + + path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096); + if (!eb) { + test_msg("Could not allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + path->slots[0] = 0; + + key.objectid = 0; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = 0; + + setup_items_for_insert(root, path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); + item = btrfs_item_nr(0); + write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), + value_len); + + key.offset = 3; + + /* + * Passing NULL trans here should be safe because we have plenty of + * space in this leaf to split the item without having to split the + * leaf. + */ + ret = btrfs_split_item(NULL, root, path, &key, 17); + if (ret) { + test_msg("Split item failed %d\n", ret); + goto out; + } + + /* + * Read the first slot, it should have the original key and contain only + * 'mary had a little' + */ + btrfs_item_key_to_cpu(eb, &key, 0); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 0) { + test_msg("Invalid key at slot 0\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(0); + if (btrfs_item_size(eb, item) != strlen(split1)) { + test_msg("Invalid len in the first split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), + strlen(split1)); + if (memcmp(buf, split1, strlen(split1))) { + test_msg("Data in the buffer doesn't match what it should " + "in the first split have='%.*s' want '%s'\n", + (int)strlen(split1), buf, split1); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 1); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 3) { + test_msg("Invalid key at slot 1\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(1); + if (btrfs_item_size(eb, item) != strlen(split2)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), + strlen(split2)); + if (memcmp(buf, split2, strlen(split2))) { + test_msg("Data in the buffer doesn't match what it should " + "in the second split\n"); + ret = -EINVAL; + goto out; + } + + key.offset = 1; + /* Do it again so we test memmoving the other items in the leaf */ + ret = btrfs_split_item(NULL, root, path, &key, 4); + if (ret) { + test_msg("Second split item failed %d\n", ret); + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 0); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 0) { + test_msg("Invalid key at slot 0\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(0); + if (btrfs_item_size(eb, item) != strlen(split3)) { + test_msg("Invalid len in the first split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), + strlen(split3)); + if (memcmp(buf, split3, strlen(split3))) { + test_msg("Data in the buffer doesn't match what it should " + "in the third split"); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 1); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 1) { + test_msg("Invalid key at slot 1\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(1); + if (btrfs_item_size(eb, item) != strlen(split4)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), + strlen(split4)); + if (memcmp(buf, split4, strlen(split4))) { + test_msg("Data in the buffer doesn't match what it should " + "in the fourth split\n"); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 2); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 3) { + test_msg("Invalid key at slot 2\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(2); + if (btrfs_item_size(eb, item) != strlen(split2)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2), + strlen(split2)); + if (memcmp(buf, split2, strlen(split2))) { + test_msg("Data in the buffer doesn't match what it should " + "in the last chunk\n"); + ret = -EINVAL; + goto out; + } +out: + btrfs_free_path(path); + kfree(root); + return ret; +} + +int btrfs_test_extent_buffer_operations(void) +{ + test_msg("Running extent buffer operation tests"); + return test_btrfs_split_item(); +} diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c new file mode 100644 index 00000000000..7e99c2f98dd --- /dev/null +++ b/fs/btrfs/tests/extent-io-tests.c @@ -0,0 +1,276 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/pagemap.h> +#include <linux/sched.h> +#include "btrfs-tests.h" +#include "../extent_io.h" + +#define PROCESS_UNLOCK (1 << 0) +#define PROCESS_RELEASE (1 << 1) +#define PROCESS_TEST_LOCKED (1 << 2) + +static noinline int process_page_range(struct inode *inode, u64 start, u64 end, + unsigned long flags) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int count = 0; + int loops = 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (flags & PROCESS_TEST_LOCKED && + !PageLocked(pages[i])) + count++; + if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) + unlock_page(pages[i]); + page_cache_release(pages[i]); + if (flags & PROCESS_RELEASE) + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + loops++; + if (loops > 100000) { + printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret); + break; + } + } + return count; +} + +static int test_find_delalloc(void) +{ + struct inode *inode; + struct extent_io_tree tmp; + struct page *page; + struct page *locked_page = NULL; + unsigned long index = 0; + u64 total_dirty = 256 * 1024 * 1024; + u64 max_bytes = 128 * 1024 * 1024; + u64 start, end, test_start; + u64 found; + int ret = -EINVAL; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Failed to allocate test inode\n"); + return -ENOMEM; + } + + extent_io_tree_init(&tmp, &inode->i_data); + + /* + * First go through and create and mark all of our pages dirty, we pin + * everything to make sure our pages don't get evicted and screw up our + * test. + */ + for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) { + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + test_msg("Failed to allocate test page\n"); + ret = -ENOMEM; + goto out; + } + SetPageDirty(page); + if (index) { + unlock_page(page); + } else { + page_cache_get(page); + locked_page = page; + } + } + + /* Test this scenario + * |--- delalloc ---| + * |--- search ---| + */ + set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS); + start = 0; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Should have found at least one delalloc\n"); + goto out_bits; + } + if (start != 0 || end != 4095) { + test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n", + start, end); + goto out_bits; + } + unlock_extent(&tmp, start, end); + unlock_page(locked_page); + page_cache_release(locked_page); + + /* + * Test this scenario + * + * |--- delalloc ---| + * |--- search ---| + */ + test_start = 64 * 1024 * 1024; + locked_page = find_lock_page(inode->i_mapping, + test_start >> PAGE_CACHE_SHIFT); + if (!locked_page) { + test_msg("Couldn't find the locked page\n"); + goto out_bits; + } + set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS); + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Couldn't find delalloc in our range\n"); + goto out_bits; + } + if (start != test_start || end != max_bytes - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu, end " + "%Lu\n", test_start, max_bytes - 1, start, end); + goto out_bits; + } + if (process_page_range(inode, start, end, + PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { + test_msg("There were unlocked pages in the range\n"); + goto out_bits; + } + unlock_extent(&tmp, start, end); + /* locked_page was unlocked above */ + page_cache_release(locked_page); + + /* + * Test this scenario + * |--- delalloc ---| + * |--- search ---| + */ + test_start = max_bytes + 4096; + locked_page = find_lock_page(inode->i_mapping, test_start >> + PAGE_CACHE_SHIFT); + if (!locked_page) { + test_msg("Could'nt find the locked page\n"); + goto out_bits; + } + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (found) { + test_msg("Found range when we shouldn't have\n"); + goto out_bits; + } + if (end != (u64)-1) { + test_msg("Did not return the proper end offset\n"); + goto out_bits; + } + + /* + * Test this scenario + * [------- delalloc -------| + * [max_bytes]|-- search--| + * + * We are re-using our test_start from above since it works out well. + */ + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS); + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Didn't find our range\n"); + goto out_bits; + } + if (start != test_start || end != total_dirty - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", + test_start, total_dirty - 1, start, end); + goto out_bits; + } + if (process_page_range(inode, start, end, + PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { + test_msg("Pages in range were not all locked\n"); + goto out_bits; + } + unlock_extent(&tmp, start, end); + + /* + * Now to test where we run into a page that is no longer dirty in the + * range we want to find. + */ + page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024)) + >> PAGE_CACHE_SHIFT); + if (!page) { + test_msg("Couldn't find our page\n"); + goto out_bits; + } + ClearPageDirty(page); + page_cache_release(page); + + /* We unlocked it in the previous test */ + lock_page(locked_page); + start = test_start; + end = 0; + /* + * Currently if we fail to find dirty pages in the delalloc range we + * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search. If + * this changes at any point in the future we will need to fix this + * tests expected behavior. + */ + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Didn't find our range\n"); + goto out_bits; + } + if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", + test_start, test_start + PAGE_CACHE_SIZE - 1, start, + end); + goto out_bits; + } + if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED | + PROCESS_UNLOCK)) { + test_msg("Pages in range were not all locked\n"); + goto out_bits; + } + ret = 0; +out_bits: + clear_extent_bits(&tmp, 0, total_dirty - 1, + (unsigned long)-1, GFP_NOFS); +out: + if (locked_page) + page_cache_release(locked_page); + process_page_range(inode, 0, total_dirty - 1, + PROCESS_UNLOCK | PROCESS_RELEASE); + iput(inode); + return ret; +} + +int btrfs_test_extent_io(void) +{ + test_msg("Running find delalloc tests\n"); + return test_find_delalloc(); +} diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 6fc82010dc1..c8d9ddf84c6 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -101,7 +101,7 @@ static int test_extents(struct btrfs_block_group_cache *cache) ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); if (ret) { - test_msg("Error removing middle peice %d\n", ret); + test_msg("Error removing middle piece %d\n", ret); return ret; } @@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) } if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { - test_msg("Left over peices after removing overlapping\n"); + test_msg("Left over pieces after removing overlapping\n"); return -1; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c new file mode 100644 index 00000000000..3ae0f5b8bb8 --- /dev/null +++ b/fs/btrfs/tests/inode-tests.c @@ -0,0 +1,928 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../btrfs_inode.h" +#include "../disk-io.h" +#include "../extent_io.h" +#include "../volumes.h" + +static void insert_extent(struct btrfs_root *root, u64 start, u64 len, + u64 ram_bytes, u64 offset, u64 disk_bytenr, + u64 disk_len, u32 type, u8 compression, int slot) +{ + struct btrfs_path path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf = root->node; + struct btrfs_key key; + u32 value_len = sizeof(struct btrfs_file_extent_item); + + if (type == BTRFS_FILE_EXTENT_INLINE) + value_len += len; + memset(&path, 0, sizeof(path)); + + path.nodes[0] = leaf; + path.slots[0] = slot; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + setup_items_for_insert(root, &path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, 1); + btrfs_set_file_extent_type(leaf, fi, type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len); + btrfs_set_file_extent_offset(leaf, fi, offset); + btrfs_set_file_extent_num_bytes(leaf, fi, len); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); +} + +static void insert_inode_item_key(struct btrfs_root *root) +{ + struct btrfs_path path; + struct extent_buffer *leaf = root->node; + struct btrfs_key key; + u32 value_len = 0; + + memset(&path, 0, sizeof(path)); + + path.nodes[0] = leaf; + path.slots[0] = 0; + + key.objectid = BTRFS_INODE_ITEM_KEY; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + setup_items_for_insert(root, &path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); +} + +/* + * Build the most complicated map of extents the earth has ever seen. We want + * this so we can test all of the corner cases of btrfs_get_extent. Here is a + * diagram of how the extents will look though this may not be possible we still + * want to make sure everything acts normally (the last number is not inclusive) + * + * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288] + * [hole ][inline][ hole ][ regular ][regular1 split][ hole ] + * + * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056] + * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ] + * + * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ] + * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent] + * + * [81920-86016] + * [ regular ] + */ +static void setup_file_extents(struct btrfs_root *root) +{ + int slot = 0; + u64 disk_bytenr = 1 * 1024 * 1024; + u64 offset = 0; + + /* First we want a hole */ + insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, + slot); + slot++; + offset += 5; + + /* + * Now we want an inline extent, I don't think this is possible but hey + * why not? Also keep in mind if we have an inline extent it counts as + * the whole first page. If we were to expand it we would have to cow + * and we wouldn't have an inline extent anymore. + */ + insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, + slot); + slot++; + offset = 4096; + + /* Now another hole */ + insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, + slot); + slot++; + offset += 4; + + /* Now for a regular extent */ + insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + disk_bytenr += 4096; + offset += 4095; + + /* + * Now for 3 extents that were split from a hole punch so we test + * offsets properly. + */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG, + 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 8192; + disk_bytenr += 16384; + + /* Now for a unwritten prealloc extent */ + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 4096; + + /* + * We want to jack up disk_bytenr a little more so the em stuff doesn't + * merge our records. + */ + disk_bytenr += 8192; + + /* + * Now for a partially written prealloc extent, basically the same as + * the hole punch example above. Ram_bytes never changes when you mark + * extents written btw. + */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 8192; + disk_bytenr += 16384; + + /* Now a normal compressed extent */ + insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 8192; + /* No merges */ + disk_bytenr += 8192; + + /* Now a split compressed extent */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 8192; + disk_bytenr += 8192; + + /* Now extents that have a hole but no hole extent */ + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 16384; + disk_bytenr += 4096; + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); +} + +static unsigned long prealloc_only = 0; +static unsigned long compressed_only = 0; +static unsigned long vacancy_only = 0; + +static noinline int test_btrfs_get_extent(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + struct extent_map *em = NULL; + u64 orig_start; + u64 disk_bytenr; + u64 offset; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; + BTRFS_I(inode)->location.offset = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + /* + * We do this since btrfs_get_extent wants to assign em->bdev to + * root->fs_info->fs_devices->latest_bdev. + */ + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + root->node = alloc_dummy_extent_buffer(0, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + goto out; + } + + /* + * We will just free a dummy node if it's ref count is 2 so we need an + * extra ref so our searches don't accidently release our page. + */ + extent_buffer_get(root->node); + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + ret = -EINVAL; + + /* First with no extents */ + BTRFS_I(inode)->root = root; + em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0); + if (IS_ERR(em)) { + em = NULL; + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + test_msg("Vacancy flag wasn't set properly\n"); + goto out; + } + free_extent_map(em); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + + /* + * All of the magic numbers are based on the mapping setup in + * setup_file_extents, so if you change anything there you need to + * update the comment and update the expected values below. + */ + setup_file_extents(root); + + em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != 0 || em->len != 5) { + test_msg("Unexpected extent wanted start 0 len 5, got start " + "%llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_INLINE) { + test_msg("Expected an inline, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4091) { + test_msg("Unexpected extent wanted start %llu len 1, got start " + "%llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + /* + * We don't test anything else for inline since it doesn't get set + * unless we have a page for it to write into. Maybe we should change + * this? + */ + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4) { + test_msg("Unexpected extent wanted start %llu len 4, got start " + "%llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Regular extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4095) { + test_msg("Unexpected extent wanted start %llu len 4095, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* The next 3 are split extents */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + orig_start, em->orig_start); + goto out; + } + disk_bytenr += (em->start - orig_start); + if (em->block_start != disk_bytenr) { + test_msg("Wrong block start, want %llu, have %llu\n", + disk_bytenr, em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Prealloc extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* The next 3 are a half written prealloc extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_HOLE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Unexpected orig offset, wanted %llu, have %llu\n", + orig_start, em->orig_start); + goto out; + } + if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + test_msg("Unexpected block start, wanted %llu, have %llu\n", + disk_bytenr + (em->start - em->orig_start), + em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start, + em->orig_start); + goto out; + } + if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + test_msg("Unexpected block start, wanted %llu, have %llu\n", + disk_bytenr + (em->start - em->orig_start), + em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Now for the compressed extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, em->orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Split compressed extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, em->orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != disk_bytenr) { + test_msg("Block start does not match, want %llu got %llu\n", + disk_bytenr, em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* A hole between regular extents but no hole extent */ + em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole extent, got %llu\n", em->block_start); + goto out; + } + /* + * Currently we just return a length that we requested rather than the + * length of the actual hole, if this changes we'll have to change this + * test. + */ + if (em->start != offset || em->len != 12288) { + test_msg("Unexpected extent wanted start %llu len 12288, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != vacancy_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + vacancy_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + ret = 0; +out: + if (!IS_ERR(em)) + free_extent_map(em); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + +static int test_hole_first(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + struct extent_map *em = NULL; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; + BTRFS_I(inode)->location.offset = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + root->node = alloc_dummy_extent_buffer(0, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + goto out; + } + + extent_buffer_get(root->node); + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + BTRFS_I(inode)->root = root; + ret = -EINVAL; + + /* + * Need a blank inode item here just so we don't confuse + * btrfs_get_extent. + */ + insert_inode_item_key(root); + insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096, + BTRFS_FILE_EXTENT_REG, 0, 1); + em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != 0 || em->len != 4096) { + test_msg("Unexpected extent wanted start 0 len 4096, got start " + "%llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != vacancy_only) { + test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only, + em->flags); + goto out; + } + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != 4096) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != 4096 || em->len != 4096) { + test_msg("Unexpected extent wanted start 4096 len 4096, got " + "start %llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, wanted 0 got %lu\n", + em->flags); + goto out; + } + ret = 0; +out: + if (!IS_ERR(em)) + free_extent_map(em); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + +int btrfs_test_inodes(void) +{ + int ret; + + set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); + set_bit(EXTENT_FLAG_VACANCY, &vacancy_only); + set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); + + test_msg("Running btrfs_get_extent tests\n"); + ret = test_btrfs_get_extent(); + if (ret) + return ret; + test_msg("Running hole first btrfs_get_extent test\n"); + return test_hole_first(); +} diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c new file mode 100644 index 00000000000..ec3dcb20235 --- /dev/null +++ b/fs/btrfs/tests/qgroup-tests.c @@ -0,0 +1,470 @@ +/* + * Copyright (C) 2013 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../transaction.h" +#include "../disk-io.h" +#include "../qgroup.h" + +static void init_dummy_trans(struct btrfs_trans_handle *trans) +{ + memset(trans, 0, sizeof(*trans)); + trans->transid = 1; + INIT_LIST_HEAD(&trans->qgroup_ref_list); + trans->type = __TRANS_DUMMY; +} + +static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_tree_block_info *block_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); + int ret; + + init_dummy_trans(&trans); + + ins.objectid = bytenr; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); + if (ret) { + test_msg("Couldn't insert ref %d\n", ret); + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, 1); + btrfs_set_extent_generation(leaf, item, 1); + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(item + 1); + btrfs_set_tree_block_level(leaf, block_info, 1); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_free_path(path); + return 0; +} + +static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs + 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); + if (ret) + test_msg("Failed to insert backref\n"); + btrfs_free_path(path); + return ret; +} + +static int remove_extent_item(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_trans_handle trans; + struct btrfs_key key; + struct btrfs_path *path; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + path->leave_spinning = 1; + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Didn't find our key %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return 0; +} + +static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs - 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Couldn't find backref %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return ret; +} + +static int test_no_shared_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup basic add\n"); + ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_item(root, 4096, 4096); + if (ret) + return -EINVAL; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_EXCL, 0); + if (ret) { + test_msg("Couldn't remove space from the qgroup %d\n", ret); + return -EINVAL; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +/* + * Add a ref for two different roots to make sure the shared value comes out + * right, also remove one of the roots and make sure the exclusive count is + * adjusted properly. + */ +static int test_multiple_refs(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup multiple refs test\n"); + + /* We have 5 created already from the previous test */ + ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = add_tree_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +int btrfs_test_qgroups(void) +{ + struct btrfs_root *root; + struct btrfs_root *tmp_root; + int ret = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + return PTR_ERR(root); + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; + goto out; + } + + /* + * Can't use bytenr 0, some things freak out + * *cough*backref walking code*cough* + */ + root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 8192; + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 5; + root->fs_info->fs_root = tmp_root; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 256; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + /* We are using this root as our extent root */ + root->fs_info->extent_root = root; + + /* + * Some of the paths we test assume we have a filled out fs_info, so we + * just need to addt he root in there so we don't panic. + */ + root->fs_info->tree_root = root; + root->fs_info->quota_root = root; + root->fs_info->quota_enabled = 1; + + test_msg("Running qgroup tests\n"); + ret = test_no_shared_qgroup(root); + if (ret) + goto out; + ret = test_multiple_refs(root); +out: + btrfs_free_dummy_root(root); + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8c81bdc1ef9..5f379affdf2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -31,6 +31,7 @@ #include "inode-map.h" #include "volumes.h" #include "dev-replace.h" +#include "qgroup.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -57,12 +58,12 @@ static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { __TRANS_JOIN_NOLOCK), }; -static void put_transaction(struct btrfs_transaction *transaction) +void btrfs_put_transaction(struct btrfs_transaction *transaction) { WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(transaction->delayed_refs.root.rb_node); + WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); while (!list_empty(&transaction->pending_chunks)) { struct extent_map *em; @@ -75,10 +76,21 @@ static void put_transaction(struct btrfs_transaction *transaction) } } -static noinline void switch_commit_root(struct btrfs_root *root) +static noinline void switch_commit_roots(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) { - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + struct btrfs_root *root, *tmp; + + down_write(&fs_info->commit_root_sem); + list_for_each_entry_safe(root, tmp, &trans->switch_commits, + dirty_list) { + list_del_init(&root->dirty_list); + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); + if (is_fstree(root->objectid)) + btrfs_unpin_free_ino(root); + } + up_write(&fs_info->commit_root_sem); } static inline void extwriter_counter_inc(struct btrfs_transaction *trans, @@ -183,8 +195,8 @@ loop: atomic_set(&cur_trans->use_count, 2); cur_trans->start_time = get_seconds(); - cur_trans->delayed_refs.root = RB_ROOT; - cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.href_root = RB_ROOT; + atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; @@ -196,21 +208,19 @@ loop: */ smp_mb(); if (!list_empty(&fs_info->tree_mod_seq_list)) - WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " + WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when " "creating a fresh transaction\n"); if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) - WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " + WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when " "creating a fresh transaction\n"); atomic64_set(&fs_info->tree_mod_seq, 0); spin_lock_init(&cur_trans->delayed_refs.lock); - atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); - atomic_set(&cur_trans->delayed_refs.ref_seq, 0); - init_waitqueue_head(&cur_trans->delayed_refs.wait); INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->ordered_operations); INIT_LIST_HEAD(&cur_trans->pending_chunks); + INIT_LIST_HEAD(&cur_trans->switch_commits); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -232,18 +242,19 @@ loop: static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (root->ref_cows && root->last_trans < trans->transid) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); /* - * see below for in_trans_setup usage rules + * see below for IN_TRANS_SETUP usage rules * we have the reloc mutex held now, so there * is only one writer in this function */ - root->in_trans_setup = 1; + set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); - /* make sure readers find in_trans_setup before + /* make sure readers find IN_TRANS_SETUP before * they find our root->last_trans update */ smp_wmb(); @@ -270,7 +281,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, * But, we have to set root->last_trans before we * init the relocation root, otherwise, we trip over warnings * in ctree.c. The solution used here is to flag ourselves - * with root->in_trans_setup. When this is 1, we're still + * with root IN_TRANS_SETUP. When this is 1, we're still * fixing up the reloc trees and everyone must wait. * * When this is zero, they can trust root->last_trans and fly @@ -279,8 +290,8 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, * done before we pop in the zero below */ btrfs_init_reloc_root(trans, root); - smp_wmb(); - root->in_trans_setup = 0; + smp_mb__before_atomic(); + clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); } return 0; } @@ -289,16 +300,16 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; /* - * see record_root_in_trans for comments about in_trans_setup usage + * see record_root_in_trans for comments about IN_TRANS_SETUP usage * and barriers */ smp_rmb(); if (root->last_trans == trans->transid && - !root->in_trans_setup) + !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) return 0; mutex_lock(&root->fs_info->reloc_mutex); @@ -332,7 +343,7 @@ static void wait_current_trans(struct btrfs_root *root) wait_event(root->fs_info->transaction_wait, cur_trans->state >= TRANS_STATE_UNBLOCKED || cur_trans->aborted); - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); } else { spin_unlock(&root->fs_info->trans_lock); } @@ -353,6 +364,17 @@ static int may_wait_transaction(struct btrfs_root *root, int type) return 0; } +static inline bool need_reserve_reloc_root(struct btrfs_root *root) +{ + if (!root->fs_info->reloc_ctl || + !test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + root->reloc_root) + return false; + + return true; +} + static struct btrfs_trans_handle * start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, enum btrfs_reserve_flush_enum flush) @@ -360,8 +382,12 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; - int ret; u64 qgroup_reserved = 0; + bool reloc_reserved = false; + int ret; + + /* Send isn't supposed to start transactions. */ + ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); @@ -390,6 +416,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, } num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + /* + * Do the reservation for the relocation root creation + */ + if (unlikely(need_reserve_reloc_root(root))) { + num_bytes += root->nodesize; + reloc_reserved = true; + } + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, num_bytes, flush); @@ -451,12 +485,15 @@ again: h->delayed_ref_elem.seq = 0; h->type = type; h->allocating_chunk = false; + h->reloc_reserved = false; + h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && may_wait_transaction(root, type)) { + current->journal_info = h; btrfs_commit_transaction(h, root); goto again; } @@ -466,6 +503,7 @@ again: h->transid, num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; + h->reloc_reserved = reloc_reserved; } h->qgroup_reserved = qgroup_reserved; @@ -610,7 +648,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) } wait_for_commit(root, cur_trans); - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); out: return ret; } @@ -625,7 +663,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (root->fs_info->global_block_rsv.space_info->full && - btrfs_should_throttle_delayed_refs(trans, root)) + btrfs_check_space_for_delayed_refs(trans, root)) return 1; return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); @@ -662,20 +700,35 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; + int must_run_delayed_refs = 0; - if (--trans->use_count) { + if (trans->use_count > 1) { + trans->use_count--; trans->block_rsv = trans->orig_rsv; return 0; } - /* - * do the qgroup accounting as early as possible - */ - err = btrfs_delayed_refs_qgroup_accounting(trans, info); - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + trans->delayed_ref_updates = 0; + if (!trans->sync) { + must_run_delayed_refs = + btrfs_should_throttle_delayed_refs(trans, root); + cur = max_t(unsigned long, cur, 32); + + /* + * don't make the caller wait if they are from a NOLOCK + * or ATTACH transaction, it will deadlock with commit + */ + if (must_run_delayed_refs == 1 && + (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) + must_run_delayed_refs = 2; + } + if (trans->qgroup_reserved) { /* * the same root has to be passed here between start_transaction @@ -685,16 +738,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->qgroup_reserved = 0; } - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, root); - - trans->delayed_ref_updates = 0; - if (btrfs_should_throttle_delayed_refs(trans, root)) { - cur = max_t(unsigned long, cur, 1); - trans->delayed_ref_updates = 0; - btrfs_run_delayed_refs(trans, root, cur); - } - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -711,17 +754,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { - if (throttle) { - /* - * We may race with somebody else here so end up having - * to call end_transaction on ourselves again, so inc - * our use_count. - */ - trans->use_count++; + if (throttle) return btrfs_commit_transaction(trans, root); - } else { + else wake_up_process(info->transaction_kthread); - } } if (trans->type & __TRANS_FREEZABLE) @@ -735,7 +771,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); if (current->journal_info == trans) current->journal_info = NULL; @@ -744,11 +780,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_run_delayed_iputs(root); if (trans->aborted || - test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + wake_up_process(info->transaction_kthread); err = -EIO; + } assert_qgroups_uptodate(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); + if (must_run_delayed_refs) { + btrfs_async_run_delayed_refs(root, cur, + must_run_delayed_refs == 1); + } return err; } @@ -764,12 +806,6 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, return __btrfs_end_transaction(trans, root, 1); } -int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - return __btrfs_end_transaction(trans, root, 1); -} - /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of @@ -909,9 +945,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; } - if (root != root->fs_info->extent_root) - switch_commit_root(root); - return 0; } @@ -948,31 +981,35 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, return ret; ret = btrfs_run_dev_stats(trans, root->fs_info); - WARN_ON(ret); + if (ret) + return ret; ret = btrfs_run_dev_replace(trans, root->fs_info); - WARN_ON(ret); - + if (ret) + return ret; ret = btrfs_run_qgroups(trans, root->fs_info); - BUG_ON(ret); + if (ret) + return ret; /* run_qgroups might have added some more refs */ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + if (ret) + return ret; while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); + if (root != fs_info->extent_root) + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; } - down_write(&fs_info->extent_commit_sem); - switch_commit_root(fs_info->extent_root); - up_write(&fs_info->extent_commit_sem); - + list_add_tail(&fs_info->extent_root->dirty_list, + &trans->transaction->switch_commits); btrfs_after_dev_replace_commit(fs_info); return 0; @@ -1025,15 +1062,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_save_ino_cache(root, trans); /* see comments in should_cow_block() */ - root->force_cow = 0; - smp_wmb(); + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); if (root->commit_root != root->node) { - mutex_lock(&root->fs_commit_mutex); - switch_commit_root(root); - btrfs_unpin_free_ino(root); - mutex_unlock(&root->fs_commit_mutex); - + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); btrfs_set_root_node(&root->root_item, root->node); } @@ -1060,7 +1094,7 @@ int btrfs_defrag_root(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - if (xchg(&root->defrag_running, 1)) + if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) return 0; while (1) { @@ -1078,12 +1112,12 @@ int btrfs_defrag_root(struct btrfs_root *root) break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); + pr_debug("BTRFS: defrag_root cancelled\n"); ret = -EAGAIN; break; } } - root->defrag_running = 0; + clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); return ret; } @@ -1147,12 +1181,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto no_free_objectid; } - pending->error = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (pending->error) - goto no_free_objectid; - key.objectid = objectid; key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; @@ -1249,8 +1277,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } + /* + * We need to flush delayed refs in order to make sure all of our quota + * operations have been done before we call btrfs_qgroup_inherit. + */ + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + /* see comments in should_cow_block() */ - root->force_cow = 1; + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); btrfs_set_root_node(new_root_item, tmp); @@ -1453,7 +1499,7 @@ static void do_async_commit(struct work_struct *work) * We've got freeze protection passed with the transaction. * Tell lockdep about it. */ - if (ac->newtrans->type < TRANS_JOIN_NOLOCK) + if (ac->newtrans->type & __TRANS_FREEZABLE) rwsem_acquire_read( &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 0, 1, _THIS_IP_); @@ -1494,7 +1540,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * Tell lockdep we've released the freeze rwsem, since the * async commit thread will be the one to unlock it. */ - if (trans->type < TRANS_JOIN_NOLOCK) + if (ac->newtrans->type & __TRANS_FREEZABLE) rwsem_release( &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1, _THIS_IP_); @@ -1510,7 +1556,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, if (current->journal_info == trans) current->journal_info = NULL; - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); return 0; } @@ -1552,15 +1598,16 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->trans_lock); - put_transaction(cur_trans); - put_transaction(cur_trans); + if (trans->type & __TRANS_FREEZABLE) + sb_end_intwrite(root->fs_info->sb); + btrfs_put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); trace_btrfs_transaction_commit(root); - btrfs_scrub_continue(root); - if (current->journal_info == trans) current->journal_info = NULL; + btrfs_scrub_cancel(root->fs_info); kmem_cache_free(btrfs_trans_handle_cachep, trans); } @@ -1575,13 +1622,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, return ret; /* - * running the delayed items may have added new refs. account - * them now so that they hinder processing of more delayed refs - * as little as possible. - */ - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - - /* * rename don't use btrfs_join_transaction, so, once we * set the transaction to blocked above, we aren't going * to get any new ordered operations. We can safely run @@ -1596,14 +1636,14 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - return btrfs_start_all_delalloc_inodes(fs_info, 1); + return btrfs_start_delalloc_roots(fs_info, 1, -1); return 0; } static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - btrfs_wait_all_ordered_extents(fs_info); + btrfs_wait_ordered_roots(fs_info, -1); } int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -1669,7 +1709,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_for_commit(root, cur_trans); - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); return ret; } @@ -1686,7 +1726,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_for_commit(root, prev_trans); - put_transaction(prev_trans); + btrfs_put_transaction(prev_trans); } else { spin_unlock(&root->fs_info->trans_lock); } @@ -1713,6 +1753,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; btrfs_wait_delalloc_flush(root->fs_info); + + btrfs_scrub_pause(root); /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting @@ -1727,7 +1769,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, /* ->aborted might be set after the previous check, so check it */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; - goto cleanup_transaction; + goto scrub_continue; } /* * the reloc mutex makes sure that we stop @@ -1744,7 +1786,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = create_pending_snapshots(trans, root->fs_info); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1760,13 +1802,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_items(trans, root); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1777,7 +1819,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, WARN_ON(cur_trans != trans->transaction); - btrfs_scrub_pause(root); /* btrfs_commit_tree_roots is responsible for getting the * various roots consistent with each other. Every pointer * in the tree of tree roots has to point to the most up to date @@ -1797,9 +1838,18 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } + /* + * Since the transaction is done, we should set the inode map cache flag + * before any other comming transaction. + */ + if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) + btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + else + btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ @@ -1809,7 +1859,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1820,7 +1870,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = cur_trans->aborted; mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } btrfs_prepare_extent_commit(trans, root); @@ -1829,11 +1879,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); - switch_commit_root(root->fs_info->tree_root); + list_add_tail(&root->fs_info->tree_root->dirty_list, + &cur_trans->switch_commits); btrfs_set_root_node(&root->fs_info->chunk_root->root_item, root->fs_info->chunk_root->node); - switch_commit_root(root->fs_info->chunk_root); + list_add_tail(&root->fs_info->chunk_root->dirty_list, + &cur_trans->switch_commits); + + switch_commit_roots(cur_trans, root->fs_info); assert_qgroups_uptodate(trans); update_super_roots(root); @@ -1856,13 +1910,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); - goto cleanup_transaction; + goto scrub_continue; } ret = write_ctree_super(trans, root, 0); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1885,8 +1939,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, list_del_init(&cur_trans->list); spin_unlock(&root->fs_info->trans_lock); - put_transaction(cur_trans); - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(root->fs_info->sb); @@ -1905,6 +1959,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; +scrub_continue: + btrfs_scrub_continue(root); cleanup_transaction: btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -1945,7 +2001,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - pr_debug("btrfs: cleaner removing %llu\n", root->objectid); + pr_debug("BTRFS: cleaner removing %llu\n", root->objectid); btrfs_kill_all_delayed_nodes(root); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5c2af849162..7dd558ed071 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -57,6 +57,7 @@ struct btrfs_transaction { struct list_head pending_snapshots; struct list_head ordered_operations; struct list_head pending_chunks; + struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; @@ -68,6 +69,7 @@ struct btrfs_transaction { #define __TRANS_ATTACH (1U << 10) #define __TRANS_JOIN (1U << 11) #define __TRANS_JOIN_NOLOCK (1U << 12) +#define __TRANS_DUMMY (1U << 13) #define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) @@ -78,6 +80,8 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ __TRANS_ATTACH) +#define BTRFS_SEND_TRANS_STUB 1 + struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; @@ -92,6 +96,8 @@ struct btrfs_trans_handle { short aborted; short adding_csums; bool allocating_chunk; + bool reloc_reserved; + bool sync; unsigned int type; /* * this root is only needed to validate that the root passed to @@ -153,8 +159,6 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, - struct btrfs_root *root); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); @@ -166,4 +170,5 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); +void btrfs_put_transaction(struct btrfs_transaction *transaction); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 94e05c1f118..a63719cc957 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -37,7 +37,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int ret = 0; int wret; int level; - int is_extent = 0; int next_key_ret = 0; u64 last_ret = 0; u64 min_trans = 0; @@ -50,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } - if (root->ref_cows == 0 && !is_extent) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) goto out; if (btrfs_test_opt(root, SSD)) @@ -85,7 +84,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, path->keep_locks = 1; - ret = btrfs_search_forward(root, &key, NULL, path, min_trans); + ret = btrfs_search_forward(root, &key, path, min_trans); if (ret < 0) goto out; if (ret > 0) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 79f057c0619..9e1f2cd5e67 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -20,14 +20,11 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/list_sort.h> -#include "ctree.h" -#include "transaction.h" +#include "tree-log.h" #include "disk-io.h" #include "locking.h" #include "print-tree.h" #include "backref.h" -#include "compat.h" -#include "tree-log.h" #include "hash.h" /* magic values for the inode_only field in btrfs_log_inode: @@ -137,43 +134,61 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, * syncing the tree wait for us to finish */ static int start_log_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) { + int index; int ret; - int err = 0; mutex_lock(&root->log_mutex); if (root->log_root) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { + ret = -EAGAIN; + goto out; + } if (!root->log_start_pid) { root->log_start_pid = current->pid; - root->log_multiple_pids = false; + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } else if (root->log_start_pid != current->pid) { - root->log_multiple_pids = true; + set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; + } mutex_unlock(&root->log_mutex); return 0; } - root->log_multiple_pids = false; - root->log_start_pid = current->pid; + + ret = 0; mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) { + if (!root->fs_info->log_root_tree) ret = btrfs_init_log_root_tree(trans, root->fs_info); - if (ret) - err = ret; - } - if (err == 0 && !root->log_root) { + mutex_unlock(&root->fs_info->tree_log_mutex); + if (ret) + goto out; + + if (!root->log_root) { ret = btrfs_add_log_tree(trans, root); if (ret) - err = ret; + goto out; } - mutex_unlock(&root->fs_info->tree_log_mutex); + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; + } +out: mutex_unlock(&root->log_mutex); - return err; + return ret; } /* @@ -571,7 +586,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (btrfs_file_extent_disk_bytenr(eb, item) == 0) nbytes = 0; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, item); + size = btrfs_file_extent_inline_len(eb, slot, item); nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = ALIGN(start + size, root->sectorsize); } else { @@ -936,7 +951,7 @@ again: parent_objectid, victim_name, victim_name_len)) { - btrfs_inc_nlink(inode); + inc_nlink(inode); btrfs_release_path(path); ret = btrfs_unlink_inode(trans, root, dir, @@ -1006,7 +1021,7 @@ again: victim_parent = read_one_inode(root, parent_objectid); if (victim_parent) { - btrfs_inc_nlink(inode); + inc_nlink(inode); btrfs_release_path(path); ret = btrfs_unlink_inode(trans, root, @@ -1113,11 +1128,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir; - struct inode *inode; + struct inode *dir = NULL; + struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - char *name; + char *name = NULL; int namelen; int ret; int search_done = 0; @@ -1150,13 +1165,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * care of the rest */ dir = read_one_inode(root, parent_objectid); - if (!dir) - return -ENOENT; + if (!dir) { + ret = -ENOENT; + goto out; + } inode = read_one_inode(root, inode_objectid); if (!inode) { - iput(dir); - return -EIO; + ret = -EIO; + goto out; } while (ref_ptr < ref_end) { @@ -1169,14 +1186,16 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, */ if (!dir) dir = read_one_inode(root, parent_objectid); - if (!dir) - return -ENOENT; + if (!dir) { + ret = -ENOENT; + goto out; + } } else { ret = ref_get_fields(eb, ref_ptr, &namelen, &name, &ref_index); } if (ret) - return ret; + goto out; /* if we already have a perfect match, we're done */ if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), @@ -1196,12 +1215,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, parent_objectid, ref_index, name, namelen, &search_done); - if (ret == 1) { - ret = 0; + if (ret) { + if (ret == 1) + ret = 0; goto out; } - if (ret) - goto out; } /* insert our name */ @@ -1215,6 +1233,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; kfree(name); + name = NULL; if (log_ref_ver) { iput(dir); dir = NULL; @@ -1225,6 +1244,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, root, path, eb, slot, key); out: btrfs_release_path(path); + kfree(name); iput(dir); iput(inode); return ret; @@ -1234,7 +1254,8 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { int ret; - ret = btrfs_find_orphan_item(root, offset); + ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, + offset, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret > 0) ret = btrfs_insert_orphan_item(trans, root, offset); return ret; @@ -1307,6 +1328,7 @@ static int count_inode_refs(struct btrfs_root *root, break; path->slots[0]--; } +process_slot: btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != ino || @@ -1327,6 +1349,10 @@ static int count_inode_refs(struct btrfs_root *root, if (key.offset == 0) break; + if (path->slots[0] > 0) { + path->slots[0]--; + goto process_slot; + } key.offset--; btrfs_release_path(path); } @@ -1480,7 +1506,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, if (!inode->i_nlink) set_nlink(inode, 1); else - btrfs_inc_nlink(inode); + inc_nlink(inode); ret = btrfs_update_inode(trans, root, inode); } else if (ret == -EEXIST) { ret = 0; @@ -1823,7 +1849,7 @@ again: dir_key->offset, name, name_len, 0); } - if (IS_ERR_OR_NULL(log_di)) { + if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); @@ -1841,7 +1867,7 @@ again: goto out; } - btrfs_inc_nlink(inode); + inc_nlink(inode); ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (!ret) @@ -1860,6 +1886,9 @@ again: goto again; ret = 0; goto out; + } else if (IS_ERR(log_di)) { + kfree(name); + return PTR_ERR(log_di); } btrfs_release_path(log_path); kfree(name); @@ -2118,8 +2147,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level >= BTRFS_MAX_LEVEL); cur = path->nodes[*level]; - if (btrfs_header_level(cur) != *level) - WARN_ON(1); + WARN_ON(btrfs_header_level(cur) != *level); if (path->slots[*level] >= btrfs_header_nritems(cur)) @@ -2151,11 +2179,13 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); @@ -2227,11 +2257,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_and_pin_reserved_extent(root, @@ -2301,11 +2333,13 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[orig_level]; - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(trans, log, next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, log, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); @@ -2341,8 +2375,8 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long transid) +static void wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -2357,36 +2391,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != - trans->transid && root->log_transid < transid + 2 && + if (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])) schedule(); finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (root->fs_info->last_trans_log_full_commit != - trans->transid && root->log_transid < transid + 2 && + } while (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])); - return 0; } static void wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (root->fs_info->last_trans_log_full_commit != - trans->transid && atomic_read(&root->log_writers)) { + + while (atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != - trans->transid && atomic_read(&root->log_writers)) + if (atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); } } +static inline void btrfs_remove_log_ctx(struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + if (!ctx) + return; + + mutex_lock(&root->log_mutex); + list_del_init(&ctx->list); + mutex_unlock(&root->log_mutex); +} + +/* + * Invoked in log mutex context, or be sure there is no other task which + * can access the list. + */ +static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, + int index, int error) +{ + struct btrfs_log_ctx *ctx; + + if (!error) { + INIT_LIST_HEAD(&root->log_ctxs[index]); + return; + } + + list_for_each_entry(ctx, &root->log_ctxs[index], list) + ctx->log_ret = error; + + INIT_LIST_HEAD(&root->log_ctxs[index]); +} + /* * btrfs_sync_log does sends a given tree log down to the disk and * updates the super blocks to record it. When this call is done, @@ -2400,7 +2461,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, struct btrfs_log_ctx *ctx) { int index1; int index2; @@ -2408,26 +2469,35 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - unsigned long log_transid = 0; + int log_transid = 0; + struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; mutex_lock(&root->log_mutex); - log_transid = root->log_transid; - index1 = root->log_transid % 2; + log_transid = ctx->log_transid; + if (root->log_transid_committed >= log_transid) { + mutex_unlock(&root->log_mutex); + return ctx->log_ret; + } + + index1 = log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, root->log_transid); + wait_log_commit(trans, root, log_transid); mutex_unlock(&root->log_mutex); - return 0; + return ctx->log_ret; } + ASSERT(log_transid == root->log_transid); atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, root->log_transid - 1); + wait_log_commit(trans, root, log_transid - 1); + while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { + if (!btrfs_test_opt(root, SSD) && + test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); @@ -2438,7 +2508,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2459,6 +2529,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); + btrfs_set_log_full_commit(root->fs_info, trans); mutex_unlock(&root->log_mutex); goto out; } @@ -2468,7 +2539,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; - smp_mb(); /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to @@ -2476,9 +2546,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); + btrfs_init_log_ctx(&root_log_ctx); + mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); + + index2 = log_root_tree->log_transid % 2; + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + root_log_ctx.log_transid = log_root_tree->log_transid; + mutex_unlock(&log_root_tree->log_mutex); ret = update_log_root(trans, log); @@ -2491,13 +2568,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (ret) { + if (!list_empty(&root_log_ctx.list)) + list_del_init(&root_log_ctx.list); + blk_finish_plug(&plug); + btrfs_set_log_full_commit(root->fs_info, trans); + if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } - root->fs_info->last_trans_log_full_commit = trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2505,22 +2586,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - index2 = log_root_tree->log_transid % 2; + if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { + mutex_unlock(&log_root_tree->log_mutex); + ret = root_log_ctx.log_ret; + goto out; + } + + index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + root_log_ctx.log_transid); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); - ret = 0; + ret = root_log_ctx.log_ret; goto out; } + ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid - 1); + root_log_ctx.log_transid - 1); } wait_for_writer(trans, log_root_tree); @@ -2529,7 +2617,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); @@ -2543,6 +2631,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { + btrfs_set_log_full_commit(root->fs_info, trans); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2560,8 +2649,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_header_level(log_root_tree->node)); log_root_tree->log_transid++; - smp_mb(); - mutex_unlock(&log_root_tree->log_mutex); /* @@ -2571,10 +2658,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * the running transaction open, so a full commit can't hop * in and cause problems either. */ - btrfs_scrub_pause_super(root); ret = write_ctree_super(trans, root->fs_info->tree_root, 1); - btrfs_scrub_continue_super(root); if (ret) { + btrfs_set_log_full_commit(root->fs_info, trans); btrfs_abort_transaction(trans, root, ret); goto out_wake_log_root; } @@ -2585,13 +2671,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); out_wake_log_root: + /* + * We needn't get log_mutex here because we are sure all + * the other tasks are blocked. + */ + btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); + + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_transid_committed++; atomic_set(&log_root_tree->log_commit[index2], 0); - smp_mb(); + mutex_unlock(&log_root_tree->log_mutex); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: + /* See above. */ + btrfs_remove_all_log_ctxs(root, index1, ret); + + mutex_lock(&root->log_mutex); + root->log_transid_committed++; atomic_set(&root->log_commit[index1], 0); - smp_mb(); + mutex_unlock(&root->log_mutex); + if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; @@ -2608,13 +2709,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans, .process_func = process_one_buffer }; - if (trans) { - ret = walk_log_tree(trans, log, &wc); - - /* I don't think this can happen but just in case */ - if (ret) - btrfs_abort_transaction(trans, log, ret); - } + ret = walk_log_tree(trans, log, &wc); + /* I don't think this can happen but just in case */ + if (ret) + btrfs_abort_transaction(trans, log, ret); while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -2780,7 +2878,7 @@ fail: out_unlock: mutex_unlock(&BTRFS_I(dir)->log_mutex); if (ret == -ENOSPC) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0) btrfs_abort_transaction(trans, root, ret); @@ -2813,7 +2911,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); if (ret == -ENOSPC) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) btrfs_abort_transaction(trans, root, ret); @@ -2867,7 +2965,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; - struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src; int err = 0; @@ -2879,9 +2976,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, u64 ino = btrfs_ino(inode); log = root->log_root; - max_key.objectid = ino; - max_key.offset = (u64)-1; - max_key.type = key_type; min_key.objectid = ino; min_key.type = key_type; @@ -2889,8 +2983,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->keep_locks = 1; - ret = btrfs_search_forward(root, &min_key, &max_key, - path, trans->transid); + ret = btrfs_search_forward(root, &min_key, path, trans->transid); /* * we didn't find anything from this transaction, see if there @@ -2943,10 +3036,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, /* find the first key from this transaction again */ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret != 0) { - WARN_ON(1); + if (WARN_ON(ret != 0)) goto done; - } /* * we have a block from this transaction, log every item in it @@ -3172,11 +3263,10 @@ static int log_inode_item(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_inode_item *inode_item; - struct btrfs_key key; int ret; - memcpy(&key, &BTRFS_I(inode)->location, sizeof(key)); - ret = btrfs_insert_empty_item(trans, log, path, &key, + ret = btrfs_insert_empty_item(trans, log, path, + &BTRFS_I(inode)->location, sizeof(*inode_item)); if (ret && ret != -EEXIST) return ret; @@ -3190,7 +3280,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, - struct extent_buffer *src, + struct btrfs_path *src_path, u64 *last_extent, int start_slot, int nr, int inode_only) { unsigned long src_offset; @@ -3198,6 +3288,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_root *log = BTRFS_I(inode)->root->log_root; struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; + struct extent_buffer *src = src_path->nodes[0]; + struct btrfs_key first_key, last_key, key; int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -3205,6 +3297,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int i; struct list_head ordered_sums; int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + bool has_extents = false; + bool need_find_last_extent = (*last_extent == 0); + bool done = false; INIT_LIST_HEAD(&ordered_sums); @@ -3213,6 +3308,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, if (!ins_data) return -ENOMEM; + first_key.objectid = (u64)-1; + ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -3233,6 +3330,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); + if ((i == (nr - 1))) + last_key = ins_keys[i]; + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], @@ -3244,6 +3344,21 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset, ins_sizes[i]); } + /* + * We set need_find_last_extent here in case we know we were + * processing other items and then walk into the first extent in + * the inode. If we don't hit an extent then nothing changes, + * we'll do the last search the next time around. + */ + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { + has_extents = true; + if (need_find_last_extent && + first_key.objectid == (u64)-1) + first_key = ins_keys[i]; + } else { + need_find_last_extent = false; + } + /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again @@ -3308,6 +3423,128 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, list_del(&sums->list); kfree(sums); } + + if (!has_extents) + return ret; + + /* + * Because we use btrfs_search_forward we could skip leaves that were + * not modified and then assume *last_extent is valid when it really + * isn't. So back up to the previous leaf and read the end of the last + * extent before we go and fill in holes. + */ + if (need_find_last_extent) { + u64 len; + + ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + if (ret) + goto fill_holes; + if (src_path->slots[0]) + src_path->slots[0]--; + src = src_path->nodes[0]; + btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + goto fill_holes; + extent = btrfs_item_ptr(src, src_path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, + src_path->slots[0], + extent); + *last_extent = ALIGN(key.offset + len, + log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + *last_extent = key.offset + len; + } + } +fill_holes: + /* So we did prev_leaf, now we need to move to the next leaf, but a few + * things could have happened + * + * 1) A merge could have happened, so we could currently be on a leaf + * that holds what we were copying in the first place. + * 2) A split could have happened, and now not all of the items we want + * are on the same leaf. + * + * So we need to adjust how we search for holes, we need to drop the + * path and re-search for the first extent key we found, and then walk + * forward until we hit the last one we copied. + */ + if (need_find_last_extent) { + /* btrfs_prev_leaf could return 1 without releasing the path */ + btrfs_release_path(src_path); + ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, + src_path, 0, 0); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = src_path->slots[0]; + } else { + i = start_slot; + } + + /* + * Ok so here we need to go through and fill in any holes we may have + * to make sure that holes are punched for those areas in case they had + * extents previously. + */ + while (!done) { + u64 offset, len; + u64 extent_end; + + if (i >= btrfs_header_nritems(src_path->nodes[0])) { + ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = 0; + } + + btrfs_item_key_to_cpu(src, &key, i); + if (!btrfs_comp_cpu_keys(&key, &last_key)) + done = true; + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + i++; + continue; + } + extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, i, extent); + extent_end = ALIGN(key.offset + len, log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + extent_end = key.offset + len; + } + i++; + + if (*last_extent == key.offset) { + *last_extent = extent_end; + continue; + } + offset = *last_extent; + len = key.offset - *last_extent; + ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), + offset, 0, 0, len, 0, len, 0, + 0, 0); + if (ret) + break; + *last_extent = offset + len; + } + /* + * Need to let the callers know we dropped the path so they should + * re-search. + */ + if (!ret && need_find_last_extent) + ret = 1; return ret; } @@ -3327,7 +3564,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) static int log_one_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path) + struct extent_map *em, struct btrfs_path *path, + struct list_head *logged_list) { struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; @@ -3343,23 +3581,28 @@ static int log_one_extent(struct btrfs_trans_handle *trans, u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; - int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, - em->start + em->len, NULL, 0); - if (ret) - return ret; + int extent_inserted = 0; INIT_LIST_HEAD(&ordered_sums); btrfs_init_map_token(&token); - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = em->start; - ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0, 1, + sizeof(*fi), &extent_inserted); if (ret) return ret; + + if (!extent_inserted) { + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*fi)); + if (ret) + return ret; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -3375,7 +3618,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, btrfs_set_token_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG, &token); - if (em->block_start == 0) + if (em->block_start == EXTENT_MAP_HOLE) skip_csum = true; } @@ -3417,26 +3660,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (skip_csum) return 0; - if (em->compress_type) { - csum_offset = 0; - csum_len = block_len; - } - /* * First check and see if our csums are on our outstanding ordered * extents. */ -again: - spin_lock_irq(&log->log_extents_lock[index]); - list_for_each_entry(ordered, &log->logged_list[index], log_list) { + list_for_each_entry(ordered, logged_list, log_list) { struct btrfs_ordered_sum *sum; if (!mod_len) break; - if (ordered->inode != inode) - continue; - if (ordered->file_offset + ordered->len <= mod_start || mod_start + mod_len <= ordered->file_offset) continue; @@ -3479,34 +3712,32 @@ again: if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) continue; - atomic_inc(&ordered->refs); - spin_unlock_irq(&log->log_extents_lock[index]); - /* - * we've dropped the lock, we must either break or - * start over after this. - */ - wait_event(ordered->wait, ordered->csum_bytes_left == 0); + if (ordered->csum_bytes_left) { + btrfs_start_ordered_extent(inode, ordered, 0); + wait_event(ordered->wait, + ordered->csum_bytes_left == 0); + } list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); - if (ret) { - btrfs_put_ordered_extent(ordered); + if (ret) goto unlocked; - } } - btrfs_put_ordered_extent(ordered); - goto again; } - spin_unlock_irq(&log->log_extents_lock[index]); unlocked: if (!mod_len || ret) return ret; - csum_offset = mod_start - em->start; - csum_len = mod_len; + if (em->compress_type) { + csum_offset = 0; + csum_len = block_len; + } else { + csum_offset = mod_start - em->start; + csum_len = mod_len; + } /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, @@ -3532,7 +3763,8 @@ unlocked: static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + struct list_head *logged_list) { struct extent_map *em, *n; struct list_head extents; @@ -3590,7 +3822,7 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path); + ret = log_one_extent(trans, inode, root, em, path, logged_list); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -3626,6 +3858,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; + LIST_HEAD(logged_list); + u64 last_extent = 0; int err = 0; int ret; int nritems; @@ -3673,7 +3907,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(log, inode); + btrfs_get_logged_extents(inode, &logged_list); /* * a brute force approach to making sure we get the most uptodate @@ -3693,7 +3927,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags)) { + &BTRFS_I(inode)->runtime_flags) || + inode_only == LOG_INODE_EXISTS) { if (inode_only == LOG_INODE_ALL) fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; @@ -3719,7 +3954,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, while (1) { ins_nr = 0; - ret = btrfs_search_forward(root, &min_key, &max_key, + ret = btrfs_search_forward(root, &min_key, path, trans->transid); if (ret != 0) break; @@ -3740,11 +3975,15 @@ again: goto next_slot; } - ret = copy_items(trans, inode, dst_path, src, ins_start_slot, - ins_nr, inode_only); - if (ret) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { err = ret; goto out_unlock; + } if (ret) { + ins_nr = 0; + btrfs_release_path(path); + continue; } ins_nr = 1; ins_start_slot = path->slots[0]; @@ -3758,33 +3997,35 @@ next_slot: goto again; } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, src, - ins_start_slot, + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, ins_nr, inode_only); - if (ret) { + if (ret < 0) { err = ret; goto out_unlock; } + ret = 0; ins_nr = 0; } btrfs_release_path(path); - if (min_key.offset < (u64)-1) + if (min_key.offset < (u64)-1) { min_key.offset++; - else if (min_key.type < (u8)-1) + } else if (min_key.type < max_key.type) { min_key.type++; - else if (min_key.objectid < (u64)-1) - min_key.objectid++; - else + min_key.offset = 0; + } else { break; + } } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, src, ins_start_slot, - ins_nr, inode_only); - if (ret) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { err = ret; goto out_unlock; } + ret = 0; ins_nr = 0; } @@ -3792,12 +4033,13 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (fast_search) { - ret = btrfs_log_changed_extents(trans, root, inode, dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, dst_path, + &logged_list); if (ret) { err = ret; goto out_unlock; } - } else { + } else if (inode_only == LOG_INODE_ALL) { struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em, *n; @@ -3817,8 +4059,10 @@ log_extents: BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: - if (err) - btrfs_free_logged_extents(log, log->log_transid); + if (unlikely(err)) + btrfs_put_logged_extents(&logged_list); + else + btrfs_submit_logged_extents(&logged_list, log); mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); @@ -3878,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * make sure any commits to the log are forced * to be full commits */ - root->fs_info->last_trans_log_full_commit = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 1; break; } @@ -3909,7 +4152,8 @@ out: */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only) + struct dentry *parent, int exists_only, + struct btrfs_log_ctx *ctx) { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; @@ -3924,6 +4168,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } + /* + * The prev transaction commit doesn't complete, we need do + * full commit by ourselves. + */ if (root->fs_info->last_trans_log_full_commit > root->fs_info->last_trans_committed) { ret = 1; @@ -3946,9 +4194,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = start_log_trans(trans, root); + ret = start_log_trans(trans, root, ctx); if (ret) - goto end_trans; + goto end_no_trans; ret = btrfs_log_inode(trans, root, inode, inode_only); if (ret) @@ -3993,9 +4241,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: dput(old_parent); if (ret < 0) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 1; } + + if (ret) + btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); end_no_trans: return ret; @@ -4008,12 +4259,14 @@ end_no_trans: * data on disk. */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry) + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, + 0, ctx); dput(parent); return ret; @@ -4250,6 +4503,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, root->fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 1); + return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 1d4ae0d15a7..7f5b41bd537 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,17 +19,47 @@ #ifndef __TREE_LOG_ #define __TREE_LOG_ +#include "ctree.h" +#include "transaction.h" + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +struct btrfs_log_ctx { + int log_ret; + int log_transid; + struct list_head list; +}; + +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +{ + ctx->log_ret = 0; + ctx->log_transid = 0; + INIT_LIST_HEAD(&ctx->list); +} + +static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid; +} + +static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + return ACCESS_ONCE(fs_info->last_trans_log_full_commit) == + trans->transid; +} + int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_root *root, struct btrfs_log_ctx *ctx); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry); + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index b0a523b2c60..840a38b2778 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,8 +5,8 @@ */ #include <linux/slab.h> -#include <linux/export.h> #include "ulist.h" +#include "ctree.h" /* * ulist is a generic data structure to hold a collection of unique u64 @@ -14,10 +14,6 @@ * enumerating it. * It is possible to store an auxiliary value along with the key. * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. - * * A sample usage for ulists is the enumeration of directed graphs without * visiting a node twice. The pseudo-code could look like this: * @@ -50,12 +46,10 @@ */ void ulist_init(struct ulist *ulist) { - ulist->nnodes = 0; - ulist->nodes = ulist->int_nodes; - ulist->nodes_alloced = ULIST_SIZE; + INIT_LIST_HEAD(&ulist->nodes); ulist->root = RB_ROOT; + ulist->nnodes = 0; } -EXPORT_SYMBOL(ulist_init); /** * ulist_fini - free up additionally allocated memory for the ulist @@ -64,18 +58,17 @@ EXPORT_SYMBOL(ulist_init); * This is useful in cases where the base 'struct ulist' has been statically * allocated. */ -void ulist_fini(struct ulist *ulist) +static void ulist_fini(struct ulist *ulist) { - /* - * The first ULIST_SIZE elements are stored inline in struct ulist. - * Only if more elements are alocated they need to be freed. - */ - if (ulist->nodes_alloced > ULIST_SIZE) - kfree(ulist->nodes); - ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ + struct ulist_node *node; + struct ulist_node *next; + + list_for_each_entry_safe(node, next, &ulist->nodes, list) { + kfree(node); + } ulist->root = RB_ROOT; + INIT_LIST_HEAD(&ulist->nodes); } -EXPORT_SYMBOL(ulist_fini); /** * ulist_reinit - prepare a ulist for reuse @@ -89,7 +82,6 @@ void ulist_reinit(struct ulist *ulist) ulist_fini(ulist); ulist_init(ulist); } -EXPORT_SYMBOL(ulist_reinit); /** * ulist_alloc - dynamically allocate a ulist @@ -108,7 +100,6 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) return ulist; } -EXPORT_SYMBOL(ulist_alloc); /** * ulist_free - free dynamically allocated ulist @@ -123,7 +114,6 @@ void ulist_free(struct ulist *ulist) ulist_fini(ulist); kfree(ulist); } -EXPORT_SYMBOL(ulist_free); static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) { @@ -192,63 +182,32 @@ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask) { - int ret = 0; - struct ulist_node *node = NULL; + int ret; + struct ulist_node *node; + node = ulist_rbtree_search(ulist, val); if (node) { if (old_aux) *old_aux = node->aux; return 0; } + node = kmalloc(sizeof(*node), gfp_mask); + if (!node) + return -ENOMEM; - if (ulist->nnodes >= ulist->nodes_alloced) { - u64 new_alloced = ulist->nodes_alloced + 128; - struct ulist_node *new_nodes; - void *old = NULL; - int i; - - for (i = 0; i < ulist->nnodes; i++) - rb_erase(&ulist->nodes[i].rb_node, &ulist->root); - - /* - * if nodes_alloced == ULIST_SIZE no memory has been allocated - * yet, so pass NULL to krealloc - */ - if (ulist->nodes_alloced > ULIST_SIZE) - old = ulist->nodes; + node->val = val; + node->aux = aux; +#ifdef CONFIG_BTRFS_DEBUG + node->seqnum = ulist->nnodes; +#endif - new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, - gfp_mask); - if (!new_nodes) - return -ENOMEM; - - if (!old) - memcpy(new_nodes, ulist->int_nodes, - sizeof(ulist->int_nodes)); - - ulist->nodes = new_nodes; - ulist->nodes_alloced = new_alloced; - - /* - * krealloc actually uses memcpy, which does not copy rb_node - * pointers, so we have to do it ourselves. Otherwise we may - * be bitten by crashes. - */ - for (i = 0; i < ulist->nnodes; i++) { - ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]); - if (ret < 0) - return ret; - } - } - ulist->nodes[ulist->nnodes].val = val; - ulist->nodes[ulist->nnodes].aux = aux; - ret = ulist_rbtree_insert(ulist, &ulist->nodes[ulist->nnodes]); - BUG_ON(ret); - ++ulist->nnodes; + ret = ulist_rbtree_insert(ulist, node); + ASSERT(!ret); + list_add_tail(&node->list, &ulist->nodes); + ulist->nnodes++; return 1; } -EXPORT_SYMBOL(ulist_add); /** * ulist_next - iterate ulist @@ -268,11 +227,25 @@ EXPORT_SYMBOL(ulist_add); */ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) { - if (ulist->nnodes == 0) + struct ulist_node *node; + + if (list_empty(&ulist->nodes)) return NULL; - if (uiter->i < 0 || uiter->i >= ulist->nnodes) + if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes) return NULL; - - return &ulist->nodes[uiter->i++]; + if (uiter->cur_list) { + uiter->cur_list = uiter->cur_list->next; + } else { + uiter->cur_list = ulist->nodes.next; +#ifdef CONFIG_BTRFS_DEBUG + uiter->i = 0; +#endif + } + node = list_entry(uiter->cur_list, struct ulist_node, list); +#ifdef CONFIG_BTRFS_DEBUG + ASSERT(node->seqnum == uiter->i); + ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); + uiter->i++; +#endif + return node; } -EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index fb36731074b..7f78cbf5cf4 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -17,18 +17,12 @@ * enumerating it. * It is possible to store an auxiliary value along with the key. * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. */ - -/* - * number of elements statically allocated inside struct ulist - */ -#define ULIST_SIZE 16 - struct ulist_iterator { +#ifdef CONFIG_BTRFS_DEBUG int i; +#endif + struct list_head *cur_list; /* hint to start search */ }; /* @@ -37,6 +31,12 @@ struct ulist_iterator { struct ulist_node { u64 val; /* value to store */ u64 aux; /* auxiliary value saved along with the val */ + +#ifdef CONFIG_BTRFS_DEBUG + int seqnum; /* sequence number this node is added */ +#endif + + struct list_head list; /* used to link node */ struct rb_node rb_node; /* used to speed up search */ }; @@ -46,28 +46,11 @@ struct ulist { */ unsigned long nnodes; - /* - * number of nodes we already have room for - */ - unsigned long nodes_alloced; - - /* - * pointer to the array storing the elements. The first ULIST_SIZE - * elements are stored inline. In this case the it points to int_nodes. - * After exceeding ULIST_SIZE, dynamic memory is allocated. - */ - struct ulist_node *nodes; - + struct list_head nodes; struct rb_root root; - - /* - * inline storage space for the first ULIST_SIZE entries - */ - struct ulist_node int_nodes[ULIST_SIZE]; }; void ulist_init(struct ulist *ulist); -void ulist_fini(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); @@ -77,6 +60,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter); -#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0) +#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL) #endif diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index dd0dea3766f..f6a4c03ee7d 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -69,7 +69,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, ret = -ENOENT; if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); goto out; } @@ -137,7 +137,8 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, offset = btrfs_item_ptr_offset(eb, slot); offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); } else if (ret < 0) { - pr_warn("btrfs: insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!\n", + btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d " + "(0x%016llx, 0x%016llx) type %u!", ret, (unsigned long long)key.objectid, (unsigned long long)key.offset, type); goto out; @@ -183,7 +184,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); if (ret < 0) { - pr_warn("btrfs: error %d while searching for uuid item!\n", + btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!", ret); goto out; } @@ -197,7 +198,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, offset = btrfs_item_ptr_offset(eb, slot); item_size = btrfs_item_size_nr(eb, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); ret = -ENOENT; goto out; @@ -260,7 +261,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, { struct btrfs_root *root = fs_info->uuid_root; struct btrfs_key key; - struct btrfs_key max_key; struct btrfs_path *path; int ret = 0; struct extent_buffer *leaf; @@ -277,13 +277,10 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, key.objectid = 0; key.type = 0; key.offset = 0; - max_key.objectid = (u64)-1; - max_key.type = (u8)-1; - max_key.offset = (u64)-1; again_search_slot: path->keep_locks = 1; - ret = btrfs_search_forward(root, &key, &max_key, path, 0); + ret = btrfs_search_forward(root, &key, path, 0); if (ret) { if (ret > 0) ret = 0; @@ -303,7 +300,7 @@ again_search_slot: offset = btrfs_item_ptr_offset(leaf, slot); item_size = btrfs_item_size_nr(leaf, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); goto skip; } @@ -353,6 +350,6 @@ skip: out: btrfs_free_path(path); if (ret) - pr_warn("btrfs: btrfs_uuid_tree_iterate failed %d\n", ret); + btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret); return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 043b215769c..6cb82f62cb7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -28,7 +28,6 @@ #include <linux/raid/pq.h> #include <linux/semaphore.h> #include <asm/div64.h> -#include "compat.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -41,6 +40,7 @@ #include "rcu-string.h" #include "math.h" #include "dev-replace.h" +#include "sysfs.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -126,7 +126,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev, ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); if (ret) - pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", + pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), &disk_to_dev(bdev->bd_disk)->kobj); @@ -201,7 +201,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, if (IS_ERR(*bdev)) { ret = PTR_ERR(*bdev); - printk(KERN_INFO "btrfs: open %s failed\n", device_path); + printk(KERN_INFO "BTRFS: open %s failed\n", device_path); goto error; } @@ -416,7 +416,8 @@ loop_lock: device->running_pending = 1; spin_unlock(&device->io_lock); - btrfs_requeue_work(&device->work); + btrfs_queue_work(fs_info->submit_workers, + &device->work); goto done; } /* unplug every 64 requests just for good measure */ @@ -448,6 +449,14 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } +/* + * Add new device to list of registered devices + * + * Returns: + * 1 - first time device is seen + * 0 - device already known + * < 0 - error + */ static noinline int device_list_add(const char *path, struct btrfs_super_block *disk_super, u64 devid, struct btrfs_fs_devices **fs_devices_ret) @@ -455,6 +464,7 @@ static noinline int device_list_add(const char *path, struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; struct rcu_string *name; + int ret = 0; u64 found_transid = btrfs_super_generation(disk_super); fs_devices = find_fsid(disk_super->fsid); @@ -495,6 +505,7 @@ static noinline int device_list_add(const char *path, fs_devices->num_devices++; mutex_unlock(&fs_devices->device_list_mutex); + ret = 1; device->fs_devices = fs_devices; } else if (!device->name || strcmp(device->name->str, path)) { name = rcu_string_strdup(path, GFP_NOFS); @@ -513,7 +524,8 @@ static noinline int device_list_add(const char *path, fs_devices->latest_trans = found_transid; } *fs_devices_ret = fs_devices; - return 0; + + return ret; } static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) @@ -543,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) * This is ok to do without rcu read locked because we hold the * uuid mutex so nothing we touch in here is going to disappear. */ - name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); - if (!name) { - kfree(device); - goto error; + if (orig_dev->name) { + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { + kfree(device); + goto error; + } + rcu_assign_pointer(device->name, name); } - rcu_assign_pointer(device->name, name); list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; @@ -666,7 +680,8 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) if (device->bdev) fs_devices->open_devices--; - if (device->writeable && !device->is_tgtdev_for_dev_replace) { + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } @@ -909,17 +924,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); - if (disk_super->label[0]) { - if (disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; - printk(KERN_INFO "btrfs: device label %s ", disk_super->label); - } else { - printk(KERN_INFO "btrfs: device fsid %pU ", disk_super->fsid); - } - - printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); - ret = device_list_add(path, disk_super, devid, fs_devices_ret); + if (ret > 0) { + if (disk_super->label[0]) { + if (disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; + printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); + } else { + printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); + } + + printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); + ret = 0; + } if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; @@ -1438,6 +1455,22 @@ out: return ret; } +/* + * Function to update ctime/mtime for a given device path. + * Mainly used for ctime/mtime based probe like libblkid. + */ +static void update_dev_time(char *path_name) +{ + struct file *filp; + + filp = filp_open(path_name, O_RDWR, 0); + if (!filp) + return; + file_update_time(filp); + filp_close(filp, NULL); + return; +} + static int btrfs_rm_dev_item(struct btrfs_root *root, struct btrfs_device *device) { @@ -1647,8 +1680,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev == root->fs_info->fs_devices->latest_bdev) root->fs_info->fs_devices->latest_bdev = next_device->bdev; - if (device->bdev) + if (device->bdev) { device->fs_devices->open_devices--; + /* remove sysfs entry */ + btrfs_kobj_rm_device(root->fs_info, device); + } call_rcu(&device->rcu, free_device); @@ -1660,11 +1696,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) struct btrfs_fs_devices *fs_devices; fs_devices = root->fs_info->fs_devices; while (fs_devices) { - if (fs_devices->seed == cur_devices) + if (fs_devices->seed == cur_devices) { + fs_devices->seed = cur_devices->seed; break; + } fs_devices = fs_devices->seed; } - fs_devices->seed = cur_devices->seed; cur_devices->seed = NULL; lock_chunks(root); __btrfs_close_devices(cur_devices); @@ -1680,20 +1717,55 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * remove it from the devices list and zero out the old super */ if (clear_super && disk_super) { + u64 bytenr; + int i; + /* make sure this device isn't detected as part of * the FS anymore */ memset(&disk_super->magic, 0, sizeof(disk_super->magic)); set_buffer_dirty(bh); sync_dirty_buffer(bh); + + /* clear the mirror copies of super block on the disk + * being removed, 0th copy is been taken care above and + * the below would take of the rest + */ + for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + i_size_read(bdev->bd_inode)) + break; + + brelse(bh); + bh = __bread(bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (!bh) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + if (btrfs_super_bytenr(disk_super) != bytenr || + btrfs_super_magic(disk_super) != BTRFS_MAGIC) { + continue; + } + memset(&disk_super->magic, 0, + sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } } ret = 0; - /* Notify udev that device has changed */ - if (bdev) + if (bdev) { + /* Notify udev that device has changed */ btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); + } + error_brelse: brelse(bh); if (bdev) @@ -1813,7 +1885,7 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } if (!*device) { - pr_err("btrfs: no missing device found\n"); + btrfs_err(root->fs_info, "no missing device found"); return -ENOENT; } @@ -1869,7 +1941,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) fs_devices->seeding = 0; fs_devices->num_devices = 0; fs_devices->open_devices = 0; - fs_devices->total_devices = 0; fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); @@ -2041,6 +2112,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->in_fs_metadata = 1; device->is_tgtdev_for_dev_replace = 0; device->mode = FMODE_EXCL; + device->dev_stats_valid = 1; set_blocksize(device->bdev, 4096); if (seeding_dev) { @@ -2077,9 +2149,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); + + /* add sysfs device entry */ + btrfs_kobj_add_device(root->fs_info, device); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; ret = init_first_rw_device(trans, root, device); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2090,6 +2167,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_abort_transaction(trans, root, ret); goto error_trans; } + + /* Sprouting would change fsid of the mounted root, + * so rename the fsid on the sysfs + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", + root->fs_info->fsid); + if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) + goto error_trans; } else { ret = btrfs_add_device(trans, root, device); if (ret) { @@ -2131,12 +2216,15 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_commit_transaction(trans, root); } + /* Update ctime/mtime for libblkid */ + update_dev_time(device_path); return ret; error_trans: unlock_chunks(root); btrfs_end_transaction(trans, root); rcu_string_free(device->name); + btrfs_kobj_rm_device(root->fs_info, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2208,6 +2296,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, device->in_fs_metadata = 1; device->is_tgtdev_for_dev_replace = 1; device->mode = FMODE_EXCL; + device->dev_stats_valid = 1; set_blocksize(device->bdev, 4096); device->fs_devices = fs_info->fs_devices; list_add(&device->dev_list, &fs_info->fs_devices->devices); @@ -2474,9 +2563,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - kfree(map); - em->bdev = NULL; - /* once for the tree */ free_extent_map(em); /* once for us */ @@ -2550,8 +2636,7 @@ again: failed = 0; retried = true; goto again; - } else if (failed && retried) { - WARN_ON(1); + } else if (WARN_ON(failed && retried)) { ret = -ENOSPC; } error: @@ -2907,6 +2992,16 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* + * limited by count, must be the last filter + */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { + if (bargs->limit == 0) + return 0; + else + bargs->limit--; + } + return 1; } @@ -2929,6 +3024,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) int ret; int enospc_errors = 0; bool counting = true; + u64 limit_data = bctl->data.limit; + u64 limit_meta = bctl->meta.limit; + u64 limit_sys = bctl->sys.limit; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -2967,6 +3065,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) memset(&bctl->stat, 0, sizeof(bctl->stat)); spin_unlock(&fs_info->balance_lock); again: + if (!counting) { + bctl->data.limit = limit_data; + bctl->meta.limit = limit_meta; + bctl->sys.limit = limit_sys; + } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; @@ -3051,7 +3154,7 @@ loop: error: btrfs_free_path(path); if (enospc_errors) { - printk(KERN_INFO "btrfs: %d enospc errors during balance\n", + btrfs_info(fs_info, "%d enospc errors during balance", enospc_errors); if (!ret) ret = -ENOSPC; @@ -3137,8 +3240,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (!(bctl->flags & BTRFS_BALANCE_DATA) || !(bctl->flags & BTRFS_BALANCE_METADATA) || memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { - printk(KERN_ERR "btrfs: with mixed groups data and " - "metadata balance options must be the same\n"); + btrfs_err(fs_info, "with mixed groups data and " + "metadata balance options must be the same"); ret = -EINVAL; goto out; } @@ -3164,8 +3267,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->data.target, 1) || (bctl->data.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "data profile %llu\n", + btrfs_err(fs_info, "unable to start balance with target " + "data profile %llu", bctl->data.target); ret = -EINVAL; goto out; @@ -3173,8 +3276,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->meta.target, 1) || (bctl->meta.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "metadata profile %llu\n", + btrfs_err(fs_info, + "unable to start balance with target metadata profile %llu", bctl->meta.target); ret = -EINVAL; goto out; @@ -3182,8 +3285,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->sys.target, 1) || (bctl->sys.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "system profile %llu\n", + btrfs_err(fs_info, + "unable to start balance with target system profile %llu", bctl->sys.target); ret = -EINVAL; goto out; @@ -3192,7 +3295,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, /* allow dup'ed data chunks only in mixed mode */ if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { - printk(KERN_ERR "btrfs: dup for data is not allowed\n"); + btrfs_err(fs_info, "dup for data is not allowed"); ret = -EINVAL; goto out; } @@ -3212,11 +3315,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, (fs_info->avail_metadata_alloc_bits & allowed) && !(bctl->meta.target & allowed))) { if (bctl->flags & BTRFS_BALANCE_FORCE) { - printk(KERN_INFO "btrfs: force reducing metadata " - "integrity\n"); + btrfs_info(fs_info, "force reducing metadata integrity"); } else { - printk(KERN_ERR "btrfs: balance will reduce metadata " - "integrity, use force if you want this\n"); + btrfs_err(fs_info, "balance will reduce metadata " + "integrity, use force if you want this"); ret = -EINVAL; goto out; } @@ -3302,7 +3404,7 @@ static int balance_kthread(void *data) mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: continuing balance\n"); + btrfs_info(fs_info, "continuing balance"); ret = btrfs_balance(fs_info->balance_ctl, NULL); } @@ -3324,7 +3426,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->balance_lock); if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { - printk(KERN_INFO "btrfs: force skipping balance\n"); + btrfs_info(fs_info, "force skipping balance"); return 0; } @@ -3423,6 +3525,9 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) { + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { mutex_unlock(&fs_info->balance_mutex); @@ -3488,7 +3593,7 @@ static int btrfs_uuid_scan_kthread(void *data) path->keep_locks = 1; while (1) { - ret = btrfs_search_forward(root, &key, &max_key, path, 0); + ret = btrfs_search_forward(root, &key, path, 0); if (ret) { if (ret > 0) ret = 0; @@ -3539,7 +3644,7 @@ update_tree: BTRFS_UUID_KEY_SUBVOL, key.objectid); if (ret < 0) { - pr_warn("btrfs: uuid_tree_add failed %d\n", + btrfs_warn(fs_info, "uuid_tree_add failed %d", ret); break; } @@ -3551,7 +3656,7 @@ update_tree: BTRFS_UUID_KEY_RECEIVED_SUBVOL, key.objectid); if (ret < 0) { - pr_warn("btrfs: uuid_tree_add failed %d\n", + btrfs_warn(fs_info, "uuid_tree_add failed %d", ret); break; } @@ -3586,7 +3691,7 @@ out: if (trans && !IS_ERR(trans)) btrfs_end_transaction(trans, fs_info->uuid_root); if (ret) - pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret); + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); else fs_info->update_uuid_tree_gen = 1; up(&fs_info->uuid_tree_rescan_sem); @@ -3650,7 +3755,7 @@ static int btrfs_uuid_rescan_kthread(void *data) */ ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); if (ret < 0) { - pr_warn("btrfs: iterating uuid_tree failed %d\n", ret); + btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); up(&fs_info->uuid_tree_rescan_sem); return ret; } @@ -3691,7 +3796,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); if (IS_ERR(task)) { /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - pr_warn("btrfs: failed to start uuid_scan task\n"); + btrfs_warn(fs_info, "failed to start uuid_scan task"); up(&fs_info->uuid_tree_rescan_sem); return PTR_ERR(task); } @@ -3707,7 +3812,7 @@ int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); if (IS_ERR(task)) { /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - pr_warn("btrfs: failed to start uuid_rescan task\n"); + btrfs_warn(fs_info, "failed to start uuid_rescan task"); up(&fs_info->uuid_tree_rescan_sem); return PTR_ERR(task); } @@ -3864,7 +3969,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root, u8 *ptr; array_size = btrfs_super_sys_array_size(super_copy); - if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + if (array_size + item_size + sizeof(disk_key) + > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) return -EFBIG; ptr = super_copy->sys_chunk_array + array_size; @@ -3969,6 +4075,16 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } +#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ + - sizeof(struct btrfs_item) \ + - sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ + - 2 * sizeof(struct btrfs_disk_key) \ + - 2 * sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 start, u64 type) @@ -4018,6 +4134,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) @@ -4025,11 +4143,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, else max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = 32 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; } else { - printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", + btrfs_err(info, "invalid chunk type 0x%llx requested", type); BUG_ON(1); } @@ -4061,7 +4183,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (!device->writeable) { WARN(1, KERN_ERR - "btrfs: read-only device in alloc_list\n"); + "BTRFS: read-only device in alloc_list\n"); continue; } @@ -4196,9 +4318,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em = alloc_extent_map(); if (!em) { + kfree(map); ret = -ENOMEM; goto error; } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->bdev = (struct block_device *)map; em->start = start; em->len = num_bytes; @@ -4241,7 +4365,6 @@ error_del_extent: /* One for the tree reference */ free_extent_map(em); error: - kfree(map); kfree(devices_info); return ret; } @@ -4277,7 +4400,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, if (em->start != chunk_offset || em->len != chunk_size) { btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" - " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset, + " %Lu-%Lu, found %Lu-%Lu", chunk_offset, chunk_size, em->start, em->len); free_extent_map(em); return -EINVAL; @@ -4453,7 +4576,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) write_unlock(&tree->map_tree.lock); if (!em) break; - kfree(em->bdev); /* once for us */ free_extent_map(em); /* once for the tree */ @@ -4479,15 +4601,16 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * and exit, so return 1 so the callers don't try to use other copies. */ if (!em) { - btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical, + btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, logical+len); return 1; } if (em->start > logical || em->start + em->len < logical) { btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " - "%Lu-%Lu\n", logical, logical+len, em->start, + "%Lu-%Lu", logical, logical+len, em->start, em->start + em->len); + free_extent_map(em); return 1; } @@ -4666,8 +4789,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (em->start > logical || em->start + em->len < logical) { btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " - "found %Lu-%Lu\n", logical, em->start, + "found %Lu-%Lu", logical, em->start, em->start + em->len); + free_extent_map(em); return -EINVAL; } @@ -4895,7 +5019,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_stripes = map->num_stripes; max_errors = nr_parity_stripes(map); - raid_map = kmalloc(sizeof(u64) * num_stripes, + raid_map = kmalloc_array(num_stripes, sizeof(u64), GFP_NOFS); if (!raid_map) { ret = -ENOMEM; @@ -5187,13 +5311,13 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, read_unlock(&em_tree->lock); if (!em) { - printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", + printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n", chunk_start); return -EIO; } if (em->start != chunk_start) { - printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", + printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n", em->start, chunk_start); free_extent_map(em); return -EIO; @@ -5255,9 +5379,19 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } +static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) +{ + if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) + bio_endio_nodec(bio, err); + else + bio_endio(bio, err); + kfree(bbio); +} + static void btrfs_end_bio(struct bio *bio, int err) { struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_device *dev = bbio->stripes[0].dev; int is_orig_bio = 0; if (err) { @@ -5265,7 +5399,6 @@ static void btrfs_end_bio(struct bio *bio, int err) if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; - struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; @@ -5287,11 +5420,14 @@ static void btrfs_end_bio(struct bio *bio, int err) if (bio == bbio->orig_bio) is_orig_bio = 1; + btrfs_bio_counter_dec(bbio->fs_info); + if (atomic_dec_and_test(&bbio->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); bio = bbio->orig_bio; } + bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; @@ -5308,21 +5444,13 @@ static void btrfs_end_bio(struct bio *bio, int err) set_bit(BIO_UPTODATE, &bio->bi_flags); err = 0; } - kfree(bbio); - bio_endio(bio, err); + btrfs_end_bbio(bbio, bio, err); } else if (!is_orig_bio) { bio_put(bio); } } -struct async_sched { - struct bio *bio; - int rw; - struct btrfs_fs_info *info; - struct btrfs_work work; -}; - /* * see run_scheduled_bios for a description of why bios are collected for * async submit. @@ -5379,8 +5507,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, spin_unlock(&device->io_lock); if (should_queue) - btrfs_queue_worker(&root->fs_info->submit_workers, - &device->work); + btrfs_queue_work(root->fs_info->submit_workers, + &device->work); } static int bio_size_ok(struct block_device *bdev, struct bio *bio, @@ -5388,17 +5516,15 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio, { struct bio_vec *prev; struct request_queue *q = bdev_get_queue(bdev); - unsigned short max_sectors = queue_max_sectors(q); + unsigned int max_sectors = queue_max_sectors(q); struct bvec_merge_data bvm = { .bi_bdev = bdev, .bi_sector = sector, .bi_rw = bio->bi_rw, }; - if (bio->bi_vcnt == 0) { - WARN_ON(1); + if (WARN_ON(bio->bi_vcnt == 0)) return 1; - } prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (bio_sectors(bio) > max_sectors) @@ -5407,7 +5533,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio, if (!q->merge_bvec_fn) return 1; - bvm.bi_size = bio->bi_size - prev->bv_len; + bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) return 0; return 1; @@ -5422,7 +5548,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, bio->bi_private = bbio; btrfs_io_bio(bio)->stripe_index = dev_nr; bio->bi_end_io = btrfs_end_bio; - bio->bi_sector = physical >> 9; + bio->bi_iter.bi_sector = physical >> 9; #ifdef DEBUG { struct rcu_string *name; @@ -5437,6 +5563,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, } #endif bio->bi_bdev = dev->bdev; + + btrfs_bio_counter_inc_noblocked(root->fs_info); + if (async) btrfs_schedule_bio(root, dev, rw, bio); else @@ -5460,7 +5589,7 @@ again: while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset) < bvec->bv_len) { - u64 len = bio->bi_size; + u64 len = bio->bi_iter.bi_size; atomic_inc(&bbio->stripes_pending); submit_stripe_bio(root, bbio, bio, physical, dev_nr, @@ -5479,12 +5608,15 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) { atomic_inc(&bbio->error); if (atomic_dec_and_test(&bbio->stripes_pending)) { + /* Shoud be the original bio. */ + WARN_ON(bio != bbio->orig_bio); + bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; - bio->bi_sector = logical >> 9; - kfree(bbio); - bio_endio(bio, -EIO); + bio->bi_iter.bi_sector = logical >> 9; + + btrfs_end_bbio(bbio, bio, -EIO); } } @@ -5493,7 +5625,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, { struct btrfs_device *dev; struct bio *first_bio = bio; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; u64 *raid_map = NULL; @@ -5502,31 +5634,41 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, int total_devs = 1; struct btrfs_bio *bbio = NULL; - length = bio->bi_size; + length = bio->bi_iter.bi_size; map_length = length; + btrfs_bio_counter_inc_blocked(root->fs_info); ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, mirror_num, &raid_map); - if (ret) /* -ENOMEM */ + if (ret) { + btrfs_bio_counter_dec(root->fs_info); return ret; + } total_devs = bbio->num_stripes; bbio->orig_bio = first_bio; bbio->private = first_bio->bi_private; bbio->end_io = first_bio->bi_end_io; + bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); if (raid_map) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { - return raid56_parity_write(root, bio, bbio, - raid_map, map_length); + ret = raid56_parity_write(root, bio, bbio, + raid_map, map_length); } else { - return raid56_parity_recover(root, bio, bbio, - raid_map, map_length, - mirror_num); + ret = raid56_parity_recover(root, bio, bbio, + raid_map, map_length, + mirror_num); } + /* + * FIXME, replace dosen't support raid56 yet, please fix + * it in the future. + */ + btrfs_bio_counter_dec(root->fs_info); + return ret; } if (map_length < length) { @@ -5561,6 +5703,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; + bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; } submit_stripe_bio(root, bbio, bio, @@ -5568,6 +5711,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, async_submit); dev_nr++; } + btrfs_bio_counter_dec(root->fs_info); return 0; } @@ -5631,10 +5775,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, struct btrfs_device *dev; u64 tmp; - if (!devid && !fs_info) { - WARN_ON(1); + if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); - } dev = __alloc_device(); if (IS_ERR(dev)) @@ -5658,7 +5800,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); - dev->work.func = pending_bios_fn; + btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); return dev; } @@ -5703,6 +5845,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, return -ENOMEM; } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->bdev = (struct block_device *)map; em->start = logical; em->len = length; @@ -5727,7 +5870,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, uuid, NULL); if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { - kfree(map); free_extent_map(em); return -EIO; } @@ -5735,7 +5877,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, map->stripes[i].dev = add_missing_dev(root, devid, uuid); if (!map->stripes[i].dev) { - kfree(map); free_extent_map(em); return -EIO; } @@ -6027,10 +6168,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) - device->dev_root = fs_info->dev_root; - mutex_unlock(&fs_devices->device_list_mutex); + while (fs_devices) { + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->dev_root = fs_info->dev_root; + mutex_unlock(&fs_devices->device_list_mutex); + + fs_devices = fs_devices->seed; + } } static void __btrfs_reset_dev_stats(struct btrfs_device *dev) @@ -6121,7 +6266,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "error %d while searching for dev_stats item for device %s!\n", ret, rcu_str_deref(device->name)); goto out; } @@ -6131,7 +6277,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "delete too small dev_stats item for device %s failed %d!\n", rcu_str_deref(device->name), ret); goto out; } @@ -6144,7 +6291,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "insert dev_stats item for device %s failed %d!\n", rcu_str_deref(device->name), ret); goto out; } @@ -6197,16 +6345,14 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited_in_rcu(KERN_ERR - "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), - btrfs_dev_stat_read(dev, - BTRFS_DEV_STAT_CORRUPTION_ERRS), - btrfs_dev_stat_read(dev, - BTRFS_DEV_STAT_GENERATION_ERRS)); + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); } static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) @@ -6219,7 +6365,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + printk_in_rcu(KERN_INFO "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6240,12 +6387,10 @@ int btrfs_get_dev_stats(struct btrfs_root *root, mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { - printk(KERN_WARNING - "btrfs: get dev_stats failed, device not found\n"); + btrfs_warn(root->fs_info, "get dev_stats failed, device not found"); return -ENODEV; } else if (!dev->dev_stats_valid) { - printk(KERN_WARNING - "btrfs: get dev_stats failed, not yet valid\n"); + btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid"); return -ENODEV; } else if (stats->flags & BTRFS_DEV_STATS_RESET) { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b72f540c8b2..2aaa00c4781 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -43,9 +43,8 @@ struct btrfs_device { /* WRITE_SYNC bios */ struct btrfs_pending_bios pending_sync_bios; - int running_pending; u64 generation; - + int running_pending; int writeable; int in_fs_metadata; int missing; @@ -53,11 +52,11 @@ struct btrfs_device { int is_tgtdev_for_dev_replace; spinlock_t io_lock; + /* the mode sent to blkdev_get */ + fmode_t mode; struct block_device *bdev; - /* the mode sent to blkdev_get */ - fmode_t mode; struct rcu_string *name; @@ -78,16 +77,21 @@ struct btrfs_device { /* optimal io width for this device */ u32 io_width; + /* type and info about this device */ + u64 type; /* minimal io size for this device */ u32 sector_size; - /* type and info about this device */ - u64 type; /* physical drive uuid (or lvm uuid) */ u8 uuid[BTRFS_UUID_SIZE]; + /* for sending down flush barriers */ + int nobarriers; + struct bio *flush_bio; + struct completion flush_wait; + /* per-device scrub information */ struct scrub_ctx *scrub_device; @@ -103,10 +107,6 @@ struct btrfs_device { struct radix_tree_root reada_zones; struct radix_tree_root reada_extents; - /* for sending down flush barriers */ - struct bio *flush_bio; - struct completion flush_wait; - int nobarriers; /* disk I/O failure stats. For detailed description refer to * enum btrfs_dev_stat_values in ioctl.h */ @@ -132,7 +132,9 @@ struct btrfs_fs_devices { /* all of the devices in the FS, protected by a mutex * so we can safely walk it to write out the supers without - * worrying about add/remove by the multi-device code + * worrying about add/remove by the multi-device code. + * Scrubbing super can kick off supers writing by holding + * this mutex lock. */ struct mutex device_list_mutex; struct list_head devices; @@ -188,10 +190,14 @@ struct btrfs_bio_stripe { struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); +#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1 + struct btrfs_bio { atomic_t stripes_pending; + struct btrfs_fs_info *fs_info; bio_end_io_t *end_io; struct bio *orig_bio; + unsigned long flags; void *private; atomic_t error; int max_errors; @@ -252,6 +258,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) /* * Profile changing flags. When SOFT is set we won't relocate chunk if diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 05740b9789e..ad8328d797e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -22,11 +22,13 @@ #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/security.h> +#include <linux/posix_acl_xattr.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" #include "xattr.h" #include "disk-io.h" +#include "props.h" ssize_t __btrfs_getxattr(struct inode *inode, const char *name, @@ -313,8 +315,8 @@ err: */ const struct xattr_handler *btrfs_xattr_handlers[] = { #ifdef CONFIG_BTRFS_FS_POSIX_ACL - &btrfs_xattr_acl_access_handler, - &btrfs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL, }; @@ -331,7 +333,8 @@ static bool btrfs_is_valid_xattr(const char *name) XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || + !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); } ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, @@ -373,6 +376,10 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, + value, size, flags); + if (size == 0) value = ""; /* empty EA, do not remove */ @@ -402,6 +409,10 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, + NULL, 0, XATTR_REPLACE); + return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); } diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index b3cc8039134..5049608d138 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -21,8 +21,6 @@ #include <linux/xattr.h> -extern const struct xattr_handler btrfs_xattr_acl_access_handler; -extern const struct xattr_handler btrfs_xattr_acl_default_handler; extern const struct xattr_handler *btrfs_xattr_handlers[]; extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 9acb846c3e7..b67d8fc8127 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,8 +97,8 @@ static int zlib_compress_pages(struct list_head *ws, *total_in = 0; if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { - printk(KERN_WARNING "btrfs: deflateInit failed\n"); - ret = -1; + printk(KERN_WARNING "BTRFS: deflateInit failed\n"); + ret = -EIO; goto out; } @@ -110,7 +110,7 @@ static int zlib_compress_pages(struct list_head *ws, out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { - ret = -1; + ret = -ENOMEM; goto out; } cpage_out = kmap(out_page); @@ -125,10 +125,10 @@ static int zlib_compress_pages(struct list_head *ws, while (workspace->def_strm.total_in < len) { ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); if (ret != Z_OK) { - printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n", + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); zlib_deflateEnd(&workspace->def_strm); - ret = -1; + ret = -EIO; goto out; } @@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws, if (workspace->def_strm.total_in > 8192 && workspace->def_strm.total_in < workspace->def_strm.total_out) { - ret = -1; + ret = -E2BIG; goto out; } /* we need another page for writing out. Test this @@ -147,12 +147,12 @@ static int zlib_compress_pages(struct list_head *ws, kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; - ret = -1; + ret = -E2BIG; goto out; } out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { - ret = -1; + ret = -ENOMEM; goto out; } cpage_out = kmap(out_page); @@ -188,12 +188,12 @@ static int zlib_compress_pages(struct list_head *ws, zlib_deflateEnd(&workspace->def_strm); if (ret != Z_STREAM_END) { - ret = -1; + ret = -EIO; goto out; } if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { - ret = -1; + ret = -E2BIG; goto out; } @@ -252,8 +252,8 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "btrfs: inflateInit failed\n"); - return -1; + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); + return -EIO; } while (workspace->inf_strm.total_in < srclen) { ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); @@ -295,7 +295,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } } if (ret != Z_STREAM_END) - ret = -1; + ret = -EIO; else ret = 0; done: @@ -336,8 +336,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "btrfs: inflateInit failed\n"); - return -1; + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); + return -EIO; } while (bytes_left > 0) { @@ -354,7 +354,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, total_out = workspace->inf_strm.total_out; if (total_out == buf_start) { - ret = -1; + ret = -EIO; break; } @@ -382,7 +382,7 @@ next: } if (ret != Z_STREAM_END && bytes_left != 0) - ret = -1; + ret = -EIO; else ret = 0; diff --git a/fs/buffer.c b/fs/buffer.c index 4d7433534f5..eba6e4f621c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -77,7 +77,7 @@ EXPORT_SYMBOL(__lock_buffer); void unlock_buffer(struct buffer_head *bh) { clear_bit_unlock(BH_Lock, &bh->b_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&bh->b_state, BH_Lock); } EXPORT_SYMBOL(unlock_buffer); @@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) int all_mapped = 1; index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); - page = find_get_page(bd_mapping, index); + page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); if (!page) goto out; @@ -654,14 +654,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); static void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { - spin_lock_irq(&mapping->tree_lock); + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } @@ -1005,9 +1007,19 @@ grow_dev_page(struct block_device *bdev, sector_t block, struct buffer_head *bh; sector_t end_block; int ret = 0; /* Will call free_more_memory() */ + gfp_t gfp_mask; + + gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; + gfp_mask |= __GFP_MOVABLE; + /* + * XXX: __getblk_slow() can not really deal with failure and + * will endlessly loop on improvised global reclaim. Prefer + * looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp_mask |= __GFP_NOFAIL; - page = find_or_create_page(inode->i_mapping, index, - (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); + page = find_or_create_page(inode->i_mapping, index, gfp_mask); if (!page) return ret; @@ -1302,7 +1314,7 @@ static void bh_lru_install(struct buffer_head *bh) } while (out < BH_LRU_SIZE) bhs[out++] = NULL; - memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); + memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); } bh_lru_unlock(); @@ -1354,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) struct buffer_head *bh = lookup_bh_lru(bdev, block, size); if (bh == NULL) { + /* __find_get_block_slow will mark the page accessed */ bh = __find_get_block_slow(bdev, block); if (bh) bh_lru_install(bh); - } - if (bh) + } else touch_buffer(bh); + return bh; } EXPORT_SYMBOL(__find_get_block); @@ -1471,16 +1484,27 @@ EXPORT_SYMBOL(set_bh_page); /* * Called when truncating a buffer on a page completely. */ + +/* Bits that are cleared during an invalidate */ +#define BUFFER_FLAGS_DISCARD \ + (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \ + 1 << BH_Delay | 1 << BH_Unwritten) + static void discard_buffer(struct buffer_head * bh) { + unsigned long b_state, b_state_old; + lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - clear_buffer_delay(bh); - clear_buffer_unwritten(bh); + b_state = bh->b_state; + for (;;) { + b_state_old = cmpxchg(&bh->b_state, b_state, + (b_state & ~BUFFER_FLAGS_DISCARD)); + if (b_state_old == b_state) + break; + b_state = b_state_old; + } unlock_buffer(bh); } @@ -2102,8 +2126,8 @@ EXPORT_SYMBOL(generic_write_end); * Returns true if all buffers which correspond to a file portion * we want to read are uptodate. */ -int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, - unsigned long from) +int block_is_partially_uptodate(struct page *page, unsigned long from, + unsigned long count) { unsigned block_start, block_end, blocksize; unsigned to; @@ -2115,7 +2139,7 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, head = page_buffers(page); blocksize = head->b_size; - to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); + to = min_t(unsigned, PAGE_CACHE_SIZE - from, count); to = from + to; if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) return 0; @@ -2867,10 +2891,9 @@ EXPORT_SYMBOL(block_truncate_page); /* * The generic ->writepage function for buffer-backed address_spaces - * this form passes in the end_io handler used to finish the IO. */ -int block_write_full_page_endio(struct page *page, get_block_t *get_block, - struct writeback_control *wbc, bh_end_io_t *handler) +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); @@ -2880,7 +2903,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, /* Is the page fully inside i_size? */ if (page->index < end_index) return __block_write_full_page(inode, page, get_block, wbc, - handler); + end_buffer_async_write); /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_CACHE_SIZE-1); @@ -2903,18 +2926,8 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, * writes to that region are not written out to the file." */ zero_user_segment(page, offset, PAGE_CACHE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc, handler); -} -EXPORT_SYMBOL(block_write_full_page_endio); - -/* - * The generic ->writepage function for buffer-backed address_spaces - */ -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) -{ - return block_write_full_page_endio(page, get_block, wbc, - end_buffer_async_write); + return __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); } EXPORT_SYMBOL(block_write_full_page); @@ -2972,11 +2985,11 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) * let it through, and the IO layer will turn it into * an EIO. */ - if (unlikely(bio->bi_sector >= maxsector)) + if (unlikely(bio->bi_iter.bi_sector >= maxsector)) return; - maxsector -= bio->bi_sector; - bytes = bio->bi_size; + maxsector -= bio->bi_iter.bi_sector; + bytes = bio->bi_iter.bi_size; if (likely((bytes >> 9) <= maxsector)) return; @@ -2984,7 +2997,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) bytes = maxsector << 9; /* Truncate the bio.. */ - bio->bi_size = bytes; + bio->bi_iter.bi_size = bytes; bio->bi_io_vec[0].bv_len = bytes; /* ..and clear the end of the buffer for reads */ @@ -3019,14 +3032,14 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) */ bio = bio_alloc(GFP_NOIO, 1); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_io_vec[0].bv_page = bh->b_page; bio->bi_io_vec[0].bv_len = bh->b_size; bio->bi_io_vec[0].bv_offset = bh_offset(bh); bio->bi_vcnt = 1; - bio->bi_size = bh->b_size; + bio->bi_iter.bi_size = bh->b_size; bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; @@ -3076,7 +3089,7 @@ EXPORT_SYMBOL(submit_bh); * until the buffer gets unlocked). * * ll_rw_block sets b_end_io to simple completion handler that marks - * the buffer up-to-date (if approriate), unlocks the buffer and wakes + * the buffer up-to-date (if appropriate), unlocks the buffer and wakes * any waiters. * * All of the buffers must be for the same device, and must also be a diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 622f4696e48..d749731dc0e 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -50,18 +50,18 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) cache->brun_percent < 100); if (*args) { - kerror("'bind' command doesn't take an argument"); + pr_err("'bind' command doesn't take an argument"); return -EINVAL; } if (!cache->rootdirname) { - kerror("No cache directory specified"); + pr_err("No cache directory specified"); return -EINVAL; } /* don't permit already bound caches to be re-bound */ if (test_bit(CACHEFILES_READY, &cache->flags)) { - kerror("Cache already bound"); + pr_err("Cache already bound"); return -EBUSY; } @@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) /* check parameters */ ret = -EOPNOTSUPP; if (!root->d_inode || - !root->d_inode->i_op || !root->d_inode->i_op->lookup || !root->d_inode->i_op->mkdir || !root->d_inode->i_op->setxattr || @@ -229,9 +228,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) set_bit(CACHEFILES_READY, &cache->flags); dput(root); - printk(KERN_INFO "CacheFiles:" - " File cache on %s registered\n", - cache->cache.identifier); + pr_info("File cache on %s registered\n", cache->cache.identifier); /* check how much space the cache has */ cachefiles_has_space(cache, 0, 0); @@ -251,7 +248,7 @@ error_open_root: kmem_cache_free(cachefiles_object_jar, fsdef); error_root_object: cachefiles_end_secure(cache, saved_cred); - kerror("Failed to register: %d", ret); + pr_err("Failed to register: %d", ret); return ret; } @@ -263,9 +260,8 @@ void cachefiles_daemon_unbind(struct cachefiles_cache *cache) _enter(""); if (test_bit(CACHEFILES_READY, &cache->flags)) { - printk(KERN_INFO "CacheFiles:" - " File cache on %s unregistering\n", - cache->cache.identifier); + pr_info("File cache on %s unregistering\n", + cache->cache.identifier); fscache_withdraw_cache(&cache->cache); } diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 0a1467b1551..b078d3081d6 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -315,8 +315,7 @@ static unsigned int cachefiles_daemon_poll(struct file *file, static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, char *args) { - kerror("Free space limits must be in range" - " 0%%<=stop<cull<run<100%%"); + pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%"); return -EINVAL; } @@ -476,12 +475,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) _enter(",%s", args); if (!*args) { - kerror("Empty directory specified"); + pr_err("Empty directory specified"); return -EINVAL; } if (cache->rootdirname) { - kerror("Second cache directory specified"); + pr_err("Second cache directory specified"); return -EEXIST; } @@ -504,12 +503,12 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) _enter(",%s", args); if (!*args) { - kerror("Empty security context specified"); + pr_err("Empty security context specified"); return -EINVAL; } if (cache->secctx) { - kerror("Second security context specified"); + pr_err("Second security context specified"); return -EINVAL; } @@ -532,7 +531,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) _enter(",%s", args); if (!*args) { - kerror("Empty tag specified"); + pr_err("Empty tag specified"); return -EINVAL; } @@ -563,12 +562,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) goto inval; if (!test_bit(CACHEFILES_READY, &cache->flags)) { - kerror("cull applied to unready cache"); + pr_err("cull applied to unready cache"); return -EIO; } if (test_bit(CACHEFILES_DEAD, &cache->flags)) { - kerror("cull applied to dead cache"); + pr_err("cull applied to dead cache"); return -EIO; } @@ -588,11 +587,11 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) notdir: path_put(&path); - kerror("cull command requires dirfd to be a directory"); + pr_err("cull command requires dirfd to be a directory"); return -ENOTDIR; inval: - kerror("cull command requires dirfd and filename"); + pr_err("cull command requires dirfd and filename"); return -EINVAL; } @@ -615,7 +614,7 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) return 0; inval: - kerror("debug command requires mask"); + pr_err("debug command requires mask"); return -EINVAL; } @@ -635,12 +634,12 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) goto inval; if (!test_bit(CACHEFILES_READY, &cache->flags)) { - kerror("inuse applied to unready cache"); + pr_err("inuse applied to unready cache"); return -EIO; } if (test_bit(CACHEFILES_DEAD, &cache->flags)) { - kerror("inuse applied to dead cache"); + pr_err("inuse applied to dead cache"); return -EIO; } @@ -660,11 +659,11 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) notdir: path_put(&path); - kerror("inuse command requires dirfd to be a directory"); + pr_err("inuse command requires dirfd to be a directory"); return -ENOTDIR; inval: - kerror("inuse command requires dirfd and filename"); + pr_err("inuse command requires dirfd and filename"); return -EINVAL; } diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 43eb5592cde..584743d456c 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -146,8 +146,7 @@ static int cachefiles_lookup_object(struct fscache_object *_object) if (ret < 0 && ret != -ETIMEDOUT) { if (ret != -ENOBUFS) - printk(KERN_WARNING - "CacheFiles: Lookup failed error %d\n", ret); + pr_warn("Lookup failed error %d\n", ret); fscache_object_lookup_error(&object->fscache); } @@ -270,7 +269,7 @@ static void cachefiles_drop_object(struct fscache_object *_object) #endif /* delete retired objects */ - if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) && + if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) && _object != cache->cache.fsdef ) { _debug("- retire object OBJ%x", object->fscache.debug_id); @@ -449,14 +448,14 @@ static int cachefiles_attr_changed(struct fscache_object *_object) _debug("discard tail %llx", oi_size); newattrs.ia_valid = ATTR_SIZE; newattrs.ia_size = oi_size & PAGE_MASK; - ret = notify_change(object->backer, &newattrs); + ret = notify_change(object->backer, &newattrs, NULL); if (ret < 0) goto truncate_failed; } newattrs.ia_valid = ATTR_SIZE; newattrs.ia_size = ni_size; - ret = notify_change(object->backer, &newattrs); + ret = notify_change(object->backer, &newattrs, NULL); truncate_failed: mutex_unlock(&object->backer->d_inode->i_mutex); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 5349473df1b..3d50998abf5 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -9,6 +9,13 @@ * 2 of the Licence, or (at your option) any later version. */ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "CacheFiles: " fmt + + #include <linux/fscache-cache.h> #include <linux/timer.h> #include <linux/wait.h> @@ -245,11 +252,10 @@ extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, /* * error handling */ -#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__) #define cachefiles_io_error(___cache, FMT, ...) \ do { \ - kerror("I/O Error: " FMT, ##__VA_ARGS__); \ + pr_err("I/O Error: " FMT, ##__VA_ARGS__); \ fscache_io_error(&(___cache)->cache); \ set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ } while (0) @@ -310,8 +316,8 @@ do { \ #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -319,9 +325,9 @@ do { \ #define ASSERTCMP(X, OP, Y) \ do { \ if (unlikely(!((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ @@ -330,8 +336,8 @@ do { \ #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -339,9 +345,9 @@ do { \ #define ASSERTIFCMP(C, X, OP, Y) \ do { \ if (unlikely((C) && !((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c index 4bfa8cf43bf..180edfb45f6 100644 --- a/fs/cachefiles/main.c +++ b/fs/cachefiles/main.c @@ -68,8 +68,7 @@ static int __init cachefiles_init(void) SLAB_HWCACHE_ALIGN, cachefiles_object_init_once); if (!cachefiles_object_jar) { - printk(KERN_NOTICE - "CacheFiles: Failed to allocate an object jar\n"); + pr_notice("Failed to allocate an object jar\n"); goto error_object_jar; } @@ -77,7 +76,7 @@ static int __init cachefiles_init(void) if (ret < 0) goto error_proc; - printk(KERN_INFO "CacheFiles: Loaded\n"); + pr_info("Loaded\n"); return 0; error_proc: @@ -85,7 +84,7 @@ error_proc: error_object_jar: misc_deregister(&cachefiles_dev); error_dev: - kerror("failed to register: %d", ret); + pr_err("failed to register: %d", ret); return ret; } @@ -96,7 +95,7 @@ fs_initcall(cachefiles_init); */ static void __exit cachefiles_exit(void) { - printk(KERN_INFO "CacheFiles: Unloading\n"); + pr_info("Unloading\n"); cachefiles_proc_cleanup(); kmem_cache_destroy(cachefiles_object_jar); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index f4a08d7fa2f..5bf2b41e66d 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -35,22 +35,21 @@ void __cachefiles_printk_object(struct cachefiles_object *object, struct fscache_cookie *cookie; unsigned keylen, loop; - printk(KERN_ERR "%sobject: OBJ%x\n", - prefix, object->fscache.debug_id); - printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", + pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id); + pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", prefix, object->fscache.state->name, object->fscache.flags, work_busy(&object->fscache.work), object->fscache.events, object->fscache.event_mask); - printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", + pr_err("%sops=%u inp=%u exc=%u\n", prefix, object->fscache.n_ops, object->fscache.n_in_progress, object->fscache.n_exclusive); - printk(KERN_ERR "%sparent=%p\n", + pr_err("%sparent=%p\n", prefix, object->fscache.parent); spin_lock(&object->fscache.lock); cookie = object->fscache.cookie; if (cookie) { - printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n", + pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n", prefix, object->fscache.cookie, object->fscache.cookie->parent, @@ -62,16 +61,16 @@ void __cachefiles_printk_object(struct cachefiles_object *object, else keylen = 0; } else { - printk(KERN_ERR "%scookie=NULL\n", prefix); + pr_err("%scookie=NULL\n", prefix); keylen = 0; } spin_unlock(&object->fscache.lock); if (keylen) { - printk(KERN_ERR "%skey=[%u] '", prefix, keylen); + pr_err("%skey=[%u] '", prefix, keylen); for (loop = 0; loop < keylen; loop++) - printk("%02x", keybuf[loop]); - printk("'\n"); + pr_cont("%02x", keybuf[loop]); + pr_cont("'\n"); } } @@ -131,13 +130,11 @@ found_dentry: dentry); if (fscache_object_is_live(&object->fscache)) { - printk(KERN_ERR "\n"); - printk(KERN_ERR "CacheFiles: Error:" - " Can't preemptively bury live object\n"); + pr_err("\n"); + pr_err("Error: Can't preemptively bury live object\n"); cachefiles_printk_object(object, NULL); } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { - printk(KERN_ERR "CacheFiles: Error:" - " Object already preemptively buried\n"); + pr_err("Error: Object already preemptively buried\n"); } write_unlock(&cache->active_lock); @@ -160,7 +157,7 @@ try_again: write_lock(&cache->active_lock); if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { - printk(KERN_ERR "CacheFiles: Error: Object already active\n"); + pr_err("Error: Object already active\n"); cachefiles_printk_object(object, NULL); BUG(); } @@ -193,9 +190,8 @@ try_again: * need to wait for it to be destroyed */ wait_for_old_object: if (fscache_object_is_live(&object->fscache)) { - printk(KERN_ERR "\n"); - printk(KERN_ERR "CacheFiles: Error:" - " Unexpected object collision\n"); + pr_err("\n"); + pr_err("Error: Unexpected object collision\n"); cachefiles_printk_object(object, xobject); BUG(); } @@ -241,9 +237,8 @@ wait_for_old_object: } if (timeout <= 0) { - printk(KERN_ERR "\n"); - printk(KERN_ERR "CacheFiles: Error: Overlong" - " wait for old active object to go away\n"); + pr_err("\n"); + pr_err("Error: Overlong wait for old active object to go away\n"); cachefiles_printk_object(object, xobject); goto requeue; } @@ -294,7 +289,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, if (ret < 0) { cachefiles_io_error(cache, "Unlink security error"); } else { - ret = vfs_unlink(dir->d_inode, rep); + ret = vfs_unlink(dir->d_inode, rep, NULL); if (preemptive) cachefiles_mark_object_buried(cache, rep); @@ -391,12 +386,12 @@ try_again: path.dentry = dir; path_to_graveyard.mnt = cache->mnt; path_to_graveyard.dentry = cache->graveyard; - ret = security_path_rename(&path, rep, &path_to_graveyard, grave); + ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0); if (ret < 0) { cachefiles_io_error(cache, "Rename security error %d", ret); } else { ret = vfs_rename(dir->d_inode, rep, - cache->graveyard->d_inode, grave); + cache->graveyard->d_inode, grave, NULL, 0); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); @@ -548,7 +543,7 @@ lookup_again: next, next->d_inode, next->d_inode->i_ino); } else if (!S_ISDIR(next->d_inode->i_mode)) { - kerror("inode %lu is not a directory", + pr_err("inode %lu is not a directory", next->d_inode->i_ino); ret = -ENOBUFS; goto error; @@ -579,7 +574,7 @@ lookup_again: } else if (!S_ISDIR(next->d_inode->i_mode) && !S_ISREG(next->d_inode->i_mode) ) { - kerror("inode %lu is not a file or directory", + pr_err("inode %lu is not a file or directory", next->d_inode->i_ino); ret = -ENOBUFS; goto error; @@ -773,14 +768,13 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, ASSERT(subdir->d_inode); if (!S_ISDIR(subdir->d_inode->i_mode)) { - kerror("%s is not a directory", dirname); + pr_err("%s is not a directory", dirname); ret = -EIO; goto check_error; } ret = -EPERM; - if (!subdir->d_inode->i_op || - !subdir->d_inode->i_op->setxattr || + if (!subdir->d_inode->i_op->setxattr || !subdir->d_inode->i_op->getxattr || !subdir->d_inode->i_op->lookup || !subdir->d_inode->i_op->mkdir || @@ -801,13 +795,13 @@ check_error: mkdir_error: mutex_unlock(&dir->d_inode->i_mutex); dput(subdir); - kerror("mkdir %s failed with error %d", dirname, ret); + pr_err("mkdir %s failed with error %d", dirname, ret); return ERR_PTR(ret); lookup_error: mutex_unlock(&dir->d_inode->i_mutex); ret = PTR_ERR(subdir); - kerror("Lookup %s failed with error %d", dirname, ret); + pr_err("Lookup %s failed with error %d", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: @@ -897,7 +891,7 @@ lookup_error: if (ret == -EIO) { cachefiles_io_error(cache, "Lookup failed"); } else if (ret != -ENOMEM) { - kerror("Internal error: %d", ret); + pr_err("Internal error: %d", ret); ret = -EIO; } @@ -956,7 +950,7 @@ error: } if (ret != -ENOMEM) { - kerror("Internal error: %d", ret); + pr_err("Internal error: %d", ret); ret = -EIO; } diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index ebaff368120..4b1fb5ca65b 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -265,24 +265,22 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, goto nomem_monitor; } - ret = add_to_page_cache(newpage, bmapping, - netpage->index, cachefiles_gfp); + ret = add_to_page_cache_lru(newpage, bmapping, + netpage->index, cachefiles_gfp); if (ret == 0) goto installed_new_backing_page; if (ret != -EEXIST) goto nomem_page; } - /* we've installed a new backing page, so now we need to add it - * to the LRU list and start it reading */ + /* we've installed a new backing page, so now we need to start + * it reading */ installed_new_backing_page: _debug("- new %p", newpage); backpage = newpage; newpage = NULL; - lru_cache_add_file(backpage); - read_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) @@ -510,24 +508,23 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto nomem; } - ret = add_to_page_cache(newpage, bmapping, - netpage->index, cachefiles_gfp); + ret = add_to_page_cache_lru(newpage, bmapping, + netpage->index, + cachefiles_gfp); if (ret == 0) goto installed_new_backing_page; if (ret != -EEXIST) goto nomem; } - /* we've installed a new backing page, so now we need to add it - * to the LRU list and start it reading */ + /* we've installed a new backing page, so now we need + * to start it reading */ installed_new_backing_page: _debug("- new %p", newpage); backpage = newpage; newpage = NULL; - lru_cache_add_file(backpage); - reread_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) @@ -538,8 +535,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, monitor_backing_page: _debug("- monitor add"); - ret = add_to_page_cache(netpage, op->mapping, netpage->index, - cachefiles_gfp); + ret = add_to_page_cache_lru(netpage, op->mapping, + netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { page_cache_release(netpage); @@ -549,8 +546,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto nomem; } - lru_cache_add_file(netpage); - /* install a monitor */ page_cache_get(netpage); monitor->netfs_page = netpage; @@ -613,8 +608,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, backing_page_already_uptodate: _debug("- uptodate"); - ret = add_to_page_cache(netpage, op->mapping, netpage->index, - cachefiles_gfp); + ret = add_to_page_cache_lru(netpage, op->mapping, + netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { page_cache_release(netpage); @@ -631,8 +626,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, fscache_mark_page_cached(op, netpage); - lru_cache_add_file(netpage); - /* the netpage is unlocked and marked up to date here */ fscache_end_io(op, netpage, 0); page_cache_release(netpage); diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index 039b5011d83..396c18ea276 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -34,9 +34,7 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) ret = set_security_override_from_ctx(new, cache->secctx); if (ret < 0) { put_cred(new); - printk(KERN_ERR "CacheFiles:" - " Security denies permission to nominate" - " security context: error %d\n", + pr_err("Security denies permission to nominate security context: error %d\n", ret); goto error; } @@ -59,16 +57,14 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, ret = security_inode_mkdir(root->d_inode, root, 0); if (ret < 0) { - printk(KERN_ERR "CacheFiles:" - " Security denies permission to make dirs: error %d", + pr_err("Security denies permission to make dirs: error %d", ret); return ret; } ret = security_inode_create(root->d_inode, root, 0); if (ret < 0) - printk(KERN_ERR "CacheFiles:" - " Security denies permission to create files: error %d", + pr_err("Security denies permission to create files: error %d", ret); return ret; diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 12b0eef8418..1ad51ffbb27 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -51,7 +51,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) } if (ret != -EEXIST) { - kerror("Can't set xattr on %*.*s [%lu] (err %d)", + pr_err("Can't set xattr on %*.*s [%lu] (err %d)", dentry->d_name.len, dentry->d_name.len, dentry->d_name.name, dentry->d_inode->i_ino, -ret); @@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) if (ret == -ERANGE) goto bad_type_length; - kerror("Can't read xattr on %*.*s [%lu] (err %d)", + pr_err("Can't read xattr on %*.*s [%lu] (err %d)", dentry->d_name.len, dentry->d_name.len, dentry->d_name.name, dentry->d_inode->i_ino, -ret); @@ -85,14 +85,14 @@ error: return ret; bad_type_length: - kerror("Cache object %lu type xattr length incorrect", + pr_err("Cache object %lu type xattr length incorrect", dentry->d_inode->i_ino); ret = -EIO; goto error; bad_type: xtype[2] = 0; - kerror("Cache object %*.*s [%lu] type %s not %s", + pr_err("Cache object %*.*s [%lu] type %s not %s", dentry->d_name.len, dentry->d_name.len, dentry->d_name.name, dentry->d_inode->i_ino, xtype, type); @@ -293,7 +293,7 @@ error: return ret; bad_type_length: - kerror("Cache object %lu xattr length incorrect", + pr_err("Cache object %lu xattr length incorrect", dentry->d_inode->i_ino); ret = -EIO; goto error; diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index ac9a2ef5bb9..264e9bf83ff 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -25,3 +25,16 @@ config CEPH_FSCACHE caching support for Ceph clients using FS-Cache endif + +config CEPH_FS_POSIX_ACL + bool "Ceph POSIX Access Control Lists" + depends on CEPH_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 32e30106a2f..85a4230b9bf 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -10,3 +10,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ debugfs.o ceph-$(CONFIG_CEPH_FSCACHE) += cache.o +ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c new file mode 100644 index 00000000000..469f2e8657e --- /dev/null +++ b/fs/ceph/acl.c @@ -0,0 +1,194 @@ +/* + * linux/fs/ceph/acl.c + * + * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include "super.h" + +static inline void ceph_set_cached_acl(struct inode *inode, + int type, struct posix_acl *acl) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + set_cached_acl(inode, type, acl); + spin_unlock(&ci->i_ceph_lock); +} + +static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, + int type) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct posix_acl *acl = ACL_NOT_CACHED; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + acl = get_cached_acl(inode, type); + spin_unlock(&ci->i_ceph_lock); + + return acl; +} + +struct posix_acl *ceph_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = __ceph_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __ceph_getxattr(inode, name, value, size); + } + + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ERANGE || size == -ENODATA || size == 0) + acl = NULL; + else + acl = ERR_PTR(-EIO); + + kfree(value); + + if (!IS_ERR(acl)) + ceph_set_cached_acl(inode, type, acl); + + return acl; +} + +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret = 0, size = 0; + const char *name = NULL; + char *value = NULL; + struct iattr newattrs; + umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + struct dentry *dentry; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &new_mode); + if (ret < 0) + goto out; + if (ret == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + ret = acl ? -EINVAL : 0; + goto out; + } + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + ret = -EINVAL; + goto out; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) + goto out_free; + } + + dentry = d_find_alias(inode); + if (new_mode != old_mode) { + newattrs.ia_mode = new_mode; + newattrs.ia_valid = ATTR_MODE; + ret = ceph_setattr(dentry, &newattrs); + if (ret) + goto out_dput; + } + + ret = __ceph_setxattr(dentry, name, value, size, 0); + if (ret) { + if (new_mode != old_mode) { + newattrs.ia_mode = old_mode; + newattrs.ia_valid = ATTR_MODE; + ceph_setattr(dentry, &newattrs); + } + goto out_dput; + } + + ceph_set_cached_acl(inode, type, acl); + +out_dput: + dput(dentry); +out_free: + kfree(value); +out: + return ret; +} + +int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + if (!default_acl && !acl) + cache_no_acl(inode); + + if (default_acl) { + error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + return error; +} diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6df8bd48142..90b3954d48e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -209,15 +209,17 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = 0; if (err < 0) { SetPageError(page); + ceph_fscache_readpage_cancel(inode, page); goto out; - } else if (err < PAGE_CACHE_SIZE) { + } + if (err < PAGE_CACHE_SIZE) /* zero fill remainder of page */ zero_user_segment(page, err, PAGE_CACHE_SIZE); - } - SetPageUptodate(page); + else + flush_dcache_page(page); - if (err == 0) - ceph_readpage_to_fscache(inode, page); + SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); out: return err < 0 ? err : 0; @@ -252,6 +254,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; + if (rc < 0) + goto unlock; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; @@ -262,6 +266,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) flush_dcache_page(page); SetPageUptodate(page); ceph_readpage_to_fscache(inode, page); +unlock: unlock_page(page); page_cache_release(page); bytes -= PAGE_CACHE_SIZE; @@ -686,7 +691,7 @@ static int ceph_writepages_start(struct address_space *mapping, (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { - pr_warning("writepage_start %p on forced umount\n", inode); + pr_warn("writepage_start %p on forced umount\n", inode); return -EIO; /* we're in a forced umount, don't write! */ } if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) @@ -1179,8 +1184,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, * never get called. */ static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t pos, unsigned long nr_segs) + struct iov_iter *iter, + loff_t pos) { WARN_ON(1); return -EINVAL; @@ -1203,6 +1208,41 @@ const struct address_space_operations ceph_aops = { /* * vm ops */ +static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; + int want, got, ret; + + dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", + inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); + + ret = filemap_fault(vma, vmf); + + dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + + return ret; +} /* * Reuse write_begin here for simplicity. @@ -1210,23 +1250,41 @@ const struct address_space_operations ceph_aops = { static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); - struct page *page = vmf->page; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct page *page = vmf->page; loff_t off = page_offset(page); - loff_t size, len; - int ret; + loff_t size = i_size_read(inode); + size_t len; + int want, got, ret; - /* Update time before taking page lock */ - file_update_time(vma->vm_file); - - size = i_size_read(inode); if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; else len = size & ~PAGE_CACHE_MASK; - dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, - off, len, page, page->index); + dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", + inode, ceph_vinop(inode), off, len, size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", + inode, off, len, ceph_cap_string(got)); + + /* Update time before taking page lock */ + file_update_time(vma->vm_file); lock_page(page); @@ -1248,14 +1306,26 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); - if (ret != VM_FAULT_LOCKED) + if (ret != VM_FAULT_LOCKED) { unlock_page(page); + } else { + int dirty; + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, len, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + return ret; } static struct vm_operations_struct ceph_vmops = { - .fault = filemap_fault, + .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 6bfe65e0b03..834f9f3723f 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -68,7 +68,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) { fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, &ceph_fscache_fsid_object_def, - fsc); + fsc, true); if (fsc->fscache == NULL) { pr_err("Unable to resgister fsid: %p fscache cookie", fsc); @@ -204,7 +204,8 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci->fscache = fscache_acquire_cookie(fsc->fscache, &ceph_fscache_inode_object_def, - ci); + ci, true); + fscache_check_consistency(ci->fscache); done: mutex_unlock(&inode->i_mutex); @@ -324,6 +325,9 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) { struct ceph_inode_info *ci = ceph_inode(inode); + if (!PageFsCache(page)) + return; + fscache_wait_on_page_write(ci->fscache, page); fscache_uncache_page(ci->fscache, page); } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index ba949408a33..5ac591bd012 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); void ceph_queue_revalidate(struct inode *inode); +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + fscache_attr_changed(ci->fscache); +} + static inline void ceph_fscache_invalidate(struct inode *inode) { fscache_invalidate(ceph_inode(inode)->fscache); @@ -67,6 +73,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return fscache_maybe_release_page(ci->fscache, page, gfp); } +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) + __fscache_uncache_page(ci->fscache, page); +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { @@ -127,6 +141,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode, { } +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ +} + static inline void ceph_fscache_invalidate(struct inode *inode) { } @@ -145,6 +163,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return 1; } +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 13976c33332..1fde164b74b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -221,8 +221,8 @@ int ceph_unreserve_caps(struct ceph_mds_client *mdsc, return 0; } -static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, - struct ceph_cap_reservation *ctx) +struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) { struct ceph_cap *cap = NULL; @@ -508,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * it is < 0. (This is so we can atomically add the cap and add an * open file reference to it.) */ -int ceph_add_cap(struct inode *inode, - struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, - unsigned seq, unsigned mseq, u64 realmino, int flags, - struct ceph_cap_reservation *caps_reservation) +void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap **new_cap) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap *new_cap = NULL; struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; @@ -531,20 +530,10 @@ int ceph_add_cap(struct inode *inode, if (fmode >= 0) wanted |= ceph_caps_for_mode(fmode); -retry: - spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { - if (new_cap) { - cap = new_cap; - new_cap = NULL; - } else { - spin_unlock(&ci->i_ceph_lock); - new_cap = get_cap(mdsc, caps_reservation); - if (new_cap == NULL) - return -ENOMEM; - goto retry; - } + cap = *new_cap; + *new_cap = NULL; cap->issued = 0; cap->implemented = 0; @@ -555,21 +544,31 @@ retry: cap->ci = ci; __insert_cap_node(ci, cap); - /* clear out old exporting info? (i.e. on cap import) */ - if (ci->i_cap_exporting_mds == mds) { - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; - } - /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; spin_unlock(&session->s_cap_lock); - } else if (new_cap) - ceph_put_cap(mdsc, new_cap); + } else { + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != cap_id); + seq = cap->seq; + mseq = cap->mseq; + issued |= cap->issued; + flags |= CEPH_CAP_FLAG_AUTH; + } + } if (!ci->i_snap_realm) { /* @@ -609,17 +608,12 @@ retry: if (flags & CEPH_CAP_FLAG_AUTH) { if (ci->i_auth_cap == NULL || - ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { ci->i_auth_cap = cap; - } else if (ci->i_auth_cap == cap) { - ci->i_auth_cap = NULL; - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p to cap_dirty_migrating\n", inode); - list_move(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + cap->mds_wanted = wanted; } - spin_unlock(&mdsc->cap_dirty_lock); + } else { + WARN_ON(ci->i_auth_cap == cap); } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", @@ -628,7 +622,7 @@ retry: cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - if (mseq > cap->mseq) + if (ceph_seq_cmp(mseq, cap->mseq) > 0) cap->mds_wanted = wanted; else cap->mds_wanted |= wanted; @@ -639,9 +633,6 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); - spin_unlock(&ci->i_ceph_lock); - wake_up_all(&ci->i_cap_wq); - return 0; } /* @@ -676,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap) */ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) { - int have = ci->i_snap_caps | ci->i_cap_exporting_issued; + int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; @@ -816,7 +807,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (cap != ocap && __cap_is_valid(cap) && + if (cap != ocap && (cap->implemented & ~cap->issued & mask)) return 1; } @@ -878,7 +869,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; - mds_wanted |= cap->mds_wanted; + if (cap == ci->i_auth_cap) + mds_wanted |= cap->mds_wanted; + else + mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); } return mds_wanted; } @@ -888,7 +882,19 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) */ static int __ceph_is_any_caps(struct ceph_inode_info *ci) { - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; + return !RB_EMPTY_ROOT(&ci->i_caps); +} + +int ceph_is_any_caps(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_is_any_caps(ci); + spin_unlock(&ci->i_ceph_lock); + + return ret; } /* @@ -897,7 +903,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci) * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. */ -void __ceph_remove_cap(struct ceph_cap *cap) +void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; @@ -909,6 +915,16 @@ void __ceph_remove_cap(struct ceph_cap *cap) /* remove from session list */ spin_lock(&session->s_cap_lock); + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || + cap->cap_gen == session->s_cap_gen)) + __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, + cap->mseq, cap->issue_seq); + if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ dout("__ceph_remove_cap delaying %p removal from session %p\n", @@ -1023,7 +1039,6 @@ void __queue_cap_release(struct ceph_mds_session *session, struct ceph_mds_cap_release *head; struct ceph_mds_cap_item *item; - spin_lock(&session->s_cap_lock); BUG_ON(!session->s_num_cap_releases); msg = list_first_entry(&session->s_cap_releases, struct ceph_msg, list_head); @@ -1052,7 +1067,6 @@ void __queue_cap_release(struct ceph_mds_session *session, (int)CEPH_CAPS_PER_RELEASE, (int)msg->front.iov_len); } - spin_unlock(&session->s_cap_lock); } /* @@ -1067,12 +1081,8 @@ void ceph_queue_caps_release(struct inode *inode) p = rb_first(&ci->i_caps); while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); - struct ceph_mds_session *session = cap->session; - - __queue_cap_release(session, ceph_ino(inode), cap->cap_id, - cap->mseq, cap->issue_seq); p = rb_next(p); - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, true); } } @@ -1379,13 +1389,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ci->i_snap_realm->cached_context); dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + WARN_ON(!ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); - if (ci->i_auth_cap) - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); - else - list_add(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); @@ -1731,13 +1738,12 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, - unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, unsigned *flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int unlock_session = session ? 0 : 1; int flushing = 0; + struct ceph_mds_session *session = NULL; retry: spin_lock(&ci->i_ceph_lock); @@ -1751,13 +1757,14 @@ retry: int want = __ceph_caps_wanted(ci); int delayed; - if (!session) { + if (!session || session != cap->session) { spin_unlock(&ci->i_ceph_lock); + if (session) + mutex_unlock(&session->s_mutex); session = cap->session; mutex_lock(&session->s_mutex); goto retry; } - BUG_ON(session != cap->session); if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; @@ -1776,7 +1783,7 @@ retry: out: spin_unlock(&ci->i_ceph_lock); out_unlocked: - if (session && unlock_session) + if (session) mutex_unlock(&session->s_mutex); return flushing; } @@ -1861,7 +1868,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; mutex_lock(&inode->i_mutex); - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); /* @@ -1896,7 +1903,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) dout("write_inode %p wait=%d\n", inode, wait); if (wait) { - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); @@ -2346,11 +2353,11 @@ static void invalidate_aliases(struct inode *inode) d_prune_aliases(inode); /* * For non-directory inode, d_find_alias() only returns - * connected dentry. After calling d_invalidate(), the - * dentry become disconnected. + * hashed dentry. After calling d_invalidate(), the + * dentry becomes unhashed. * * For directory inode, d_find_alias() can return - * disconnected dentry. But directory inode should have + * unhashed dentry. But directory inode should have * one alias at most. */ while ((dn = d_find_alias(inode))) { @@ -2372,38 +2379,52 @@ static void invalidate_aliases(struct inode *inode) * actually be a revocation if it specifies a smaller cap set.) * * caller holds s_mutex and i_ceph_lock, we drop both. - * - * return value: - * 0 - ok - * 1 - check_caps on auth cap only (writeback) - * 2 - check_caps (ack revoke) */ -static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, +static void handle_cap_grant(struct ceph_mds_client *mdsc, + struct inode *inode, struct ceph_mds_caps *grant, + void *snaptrace, int snaptrace_len, + struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, - struct ceph_cap *cap, - struct ceph_buffer *xattr_buf) - __releases(ci->i_ceph_lock) + struct ceph_cap *cap, int issued) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); - int issued, implemented, used, wanted, dirty; + int used, wanted, dirty; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); struct timespec mtime, atime, ctime; int check_caps = 0; - int wake = 0; - int writeback = 0; - int queue_invalidate = 0; - int deleted_inode = 0; - int queue_revalidate = 0; + bool wake = 0; + bool writeback = 0; + bool queue_trunc = 0; + bool queue_invalidate = 0; + bool queue_revalidate = 0; + bool deleted_inode = 0; dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); + + /* + * auth mds of the inode changed. we received the cap export message, + * but still haven't received the cap import message. handle_cap_export + * updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message + * that was sent before the cap import message. So don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); + seq = cap->seq; + newcaps |= cap->issued; + } + /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we @@ -2425,15 +2446,13 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } /* side effects now are allowed */ - - issued = __ceph_caps_issued(ci, &implemented); - issued |= implemented | __ceph_caps_dirty(ci); - cap->cap_gen = session->s_cap_gen; + cap->seq = seq; __check_cap_issue(ci, cap, newcaps); - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(grant->mode); inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); @@ -2442,7 +2461,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) { + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); if (inode->i_nlink == 0 && (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) @@ -2460,6 +2480,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; + ceph_forget_all_cached_acls(inode); } } @@ -2468,26 +2489,35 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) queue_revalidate = 1; - /* size/ctime/mtime/atime? */ - ceph_fill_file_size(inode, issued, - le32_to_cpu(grant->truncate_seq), - le64_to_cpu(grant->truncate_size), size); - ceph_decode_timespec(&mtime, &grant->mtime); - ceph_decode_timespec(&atime, &grant->atime); - ceph_decode_timespec(&ctime, &grant->ctime); - ceph_fill_file_time(inode, issued, - le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, - &atime); - - /* max size increase? */ - if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { - dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); - ci->i_max_size = max_size; - if (max_size >= ci->i_wanted_max_size) { - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; + if (newcaps & CEPH_CAP_ANY_RD) { + /* ctime/mtime/atime? */ + ceph_decode_timespec(&mtime, &grant->mtime); + ceph_decode_timespec(&atime, &grant->atime); + ceph_decode_timespec(&ctime, &grant->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(grant->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { + /* file layout may have changed */ + ci->i_layout = grant->layout; + /* size/truncate_seq? */ + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), + size); + /* max size increase? */ + if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", + ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = 1; } - wake = 1; } /* check cap bits */ @@ -2507,11 +2537,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, check_caps = 1; } - cap->seq = seq; - - /* file layout may have changed */ - ci->i_layout = grant->layout; - /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; @@ -2553,6 +2578,23 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, spin_unlock(&ci->i_ceph_lock); + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, false); + downgrade_write(&mdsc->snap_rwsem); + kick_flushing_inode_caps(mdsc, session, inode); + up_read(&mdsc->snap_rwsem); + if (newcaps & ~issued) + wake = 1; + } + + if (queue_trunc) { + ceph_queue_vmtruncate(inode); + ceph_queue_revalidate(inode); + } else if (queue_revalidate) + ceph_queue_revalidate(inode); + if (writeback) /* * queue inode for writeback: we can't actually call @@ -2564,8 +2606,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_queue_invalidate(inode); if (deleted_inode) invalidate_aliases(inode); - if (queue_revalidate) - ceph_queue_revalidate(inode); if (wake) wake_up_all(&ci->i_cap_wq); @@ -2737,123 +2777,200 @@ static void handle_cap_trunc(struct inode *inode, * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, - struct ceph_mds_session *session, - int *open_target_sessions) + struct ceph_mds_cap_peer *ph, + struct ceph_mds_session *session) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *tsession = NULL; + struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); - int mds = session->s_mds; + u64 t_cap_id; unsigned mseq = le32_to_cpu(ex->migrate_seq); - struct ceph_cap *cap = NULL, *t; - struct rb_node *p; - int remember = 1; + unsigned t_seq, t_mseq; + int target, issued; + int mds = session->s_mds; - dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + if (ph) { + t_cap_id = le64_to_cpu(ph->cap_id); + t_seq = le32_to_cpu(ph->seq); + t_mseq = le32_to_cpu(ph->mseq); + target = le32_to_cpu(ph->mds); + } else { + t_cap_id = t_seq = t_mseq = 0; + target = -1; + } + dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", + inode, ci, mds, mseq, target); +retry: spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) + goto out_unlock; - /* make sure we haven't seen a higher mseq */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - t = rb_entry(p, struct ceph_cap, ci_node); - if (ceph_seq_cmp(t->mseq, mseq) > 0) { - dout(" higher mseq on cap from mds%d\n", - t->session->s_mds); - remember = 0; - } - if (t->session->s_mds == mds) - cap = t; + if (target < 0) { + __ceph_remove_cap(cap, false); + goto out_unlock; } - if (cap) { - if (remember) { - /* make note */ - ci->i_cap_exporting_mds = mds; - ci->i_cap_exporting_mseq = mseq; - ci->i_cap_exporting_issued = cap->issued; - - /* - * make sure we have open sessions with all possible - * export targets, so that we get the matching IMPORT - */ - *open_target_sessions = 1; + /* + * now we know we haven't received the cap import message yet + * because the exported cap still exist. + */ - /* - * we can't flush dirty caps that we've seen the - * EXPORT but no IMPORT for - */ - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p to cap_dirty_migrating\n", - inode); - list_move(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + issued = cap->issued; + WARN_ON(issued != cap->implemented); + + tcap = __get_cap_for_mds(ci, target); + if (tcap) { + /* already have caps from the target */ + if (tcap->cap_id != t_cap_id || + ceph_seq_cmp(tcap->seq, t_seq) < 0) { + dout(" updating import cap %p mds%d\n", tcap, target); + tcap->cap_id = t_cap_id; + tcap->seq = t_seq - 1; + tcap->issue_seq = t_seq - 1; + tcap->mseq = t_mseq; + tcap->issued |= issued; + tcap->implemented |= issued; + if (cap == ci->i_auth_cap) + ci->i_auth_cap = tcap; + if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); } - spin_unlock(&mdsc->cap_dirty_lock); } - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, false); + goto out_unlock; + } else if (tsession) { + /* add placeholder for the export tagert */ + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, + t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + + __ceph_remove_cap(cap, false); + goto out_unlock; } - /* else, we already released it */ spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + + /* open target session */ + tsession = ceph_mdsc_open_export_target_session(mdsc, target); + if (!IS_ERR(tsession)) { + if (mds > target) { + mutex_lock(&session->s_mutex); + mutex_lock_nested(&tsession->s_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&tsession->s_mutex); + mutex_lock_nested(&session->s_mutex, + SINGLE_DEPTH_NESTING); + } + ceph_add_cap_releases(mdsc, tsession); + new_cap = ceph_get_cap(mdsc, NULL); + } else { + WARN_ON(1); + tsession = NULL; + target = -1; + } + goto retry; + +out_unlock: + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + if (tsession) { + mutex_unlock(&tsession->s_mutex); + ceph_put_mds_session(tsession); + } + if (new_cap) + ceph_put_cap(mdsc, new_cap); } /* - * Handle cap IMPORT. If there are temp bits from an older EXPORT, - * clean them up. + * Handle cap IMPORT. * - * caller holds s_mutex. + * caller holds s_mutex. acquires i_ceph_lock */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, - void *snaptrace, int snaptrace_len) + struct ceph_cap **target_cap, int *old_issued) + __acquires(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap, *ocap, *new_cap = NULL; int mds = session->s_mds; - unsigned issued = le32_to_cpu(im->caps); + int issued; + unsigned caps = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); unsigned seq = le32_to_cpu(im->seq); unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); + u64 p_cap_id; + int peer; - if (ci->i_cap_exporting_mds >= 0 && - ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { - dout("handle_cap_import inode %p ci %p mds%d mseq %d" - " - cleared exporting from mds%d\n", - inode, ci, mds, mseq, - ci->i_cap_exporting_mds); - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; + if (ph) { + p_cap_id = le64_to_cpu(ph->cap_id); + peer = le32_to_cpu(ph->mds); + } else { + p_cap_id = 0; + peer = -1; + } - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p back to cap_dirty\n", inode); - list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", + inode, ci, mds, mseq, peer); + +retry: + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (!new_cap) { + spin_unlock(&ci->i_ceph_lock); + new_cap = ceph_get_cap(mdsc, NULL); + goto retry; } - spin_unlock(&mdsc->cap_dirty_lock); + cap = new_cap; } else { - dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + if (new_cap) { + ceph_put_cap(mdsc, new_cap); + new_cap = NULL; + } } - down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, - false); - downgrade_write(&mdsc->snap_rwsem); - ceph_add_cap(inode, session, cap_id, -1, - issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, - NULL /* no caps context */); - kick_flushing_inode_caps(mdsc, session, inode); - up_read(&mdsc->snap_rwsem); + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + + ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, + realmino, CEPH_CAP_FLAG_AUTH, &new_cap); + + ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; + if (ocap && ocap->cap_id == p_cap_id) { + dout(" remove export cap %p mds%d flags %d\n", + ocap, peer, ph->flags); + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && + (ocap->seq != le32_to_cpu(ph->seq) || + ocap->mseq != le32_to_cpu(ph->mseq))) { + pr_err("handle_cap_import: mismatched seq/mseq: " + "ino (%llx.%llx) mds%d seq %d mseq %d " + "importer mds%d has peer seq %d mseq %d\n", + ceph_vinop(inode), peer, ocap->seq, + ocap->mseq, mds, le32_to_cpu(ph->seq), + le32_to_cpu(ph->mseq)); + } + __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); + } /* make sure we re-request max_size, if necessary */ - spin_lock(&ci->i_ceph_lock); - ci->i_wanted_max_size = 0; /* reset */ + ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; - spin_unlock(&ci->i_ceph_lock); + + *old_issued = issued; + *target_cap = cap; } /* @@ -2871,8 +2988,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; + struct ceph_mds_cap_peer *peer = NULL; int mds = session->s_mds; - int op; + int op, issued; u32 seq, mseq; struct ceph_vino vino; u64 cap_id; @@ -2881,12 +2999,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *snaptrace; size_t snaptrace_len; void *flock; + void *end; u32 flock_len; - int open_target_sessions = 0; dout("handle_caps from mds%d\n", mds); /* decode */ + end = msg->front.iov_base + msg->front.iov_len; tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -2904,17 +3023,28 @@ void ceph_handle_caps(struct ceph_mds_session *session, snaptrace_len = le32_to_cpu(h->snap_trace_len); if (le16_to_cpu(msg->hdr.version) >= 2) { - void *p, *end; - - p = snaptrace + snaptrace_len; - end = msg->front.iov_base + msg->front.iov_len; + void *p = snaptrace + snaptrace_len; ceph_decode_32_safe(&p, end, flock_len, bad); + if (p + flock_len > end) + goto bad; flock = p; } else { flock = NULL; flock_len = 0; } + if (le16_to_cpu(msg->hdr.version) >= 3) { + if (op == CEPH_CAP_OP_IMPORT) { + void *p = flock + flock_len; + if (p + sizeof(*peer) > end) + goto bad; + peer = p; + } else if (op == CEPH_CAP_OP_EXPORT) { + /* recorded in unused fields */ + peer = (void *)&h->size; + } + } + mutex_lock(&session->s_mutex); session->s_seq++; dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, @@ -2931,9 +3061,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, if (!inode) { dout(" i don't have ino %llx\n", vino.ino); - if (op == CEPH_CAP_OP_IMPORT) + if (op == CEPH_CAP_OP_IMPORT) { + spin_lock(&session->s_cap_lock); __queue_cap_release(session, vino.ino, cap_id, mseq, seq); + spin_unlock(&session->s_cap_lock); + } goto flush_cap_releases; } @@ -2944,12 +3077,15 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done; case CEPH_CAP_OP_EXPORT: - handle_cap_export(inode, h, session, &open_target_sessions); - goto done; + handle_cap_export(inode, h, peer, session); + goto done_unlocked; case CEPH_CAP_OP_IMPORT: - handle_cap_import(mdsc, inode, h, session, - snaptrace, snaptrace_len); + handle_cap_import(mdsc, inode, h, peer, session, + &cap, &issued); + handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, + msg->middle, session, cap, issued); + goto done_unlocked; } /* the rest require a cap */ @@ -2966,8 +3102,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: - case CEPH_CAP_OP_IMPORT: - handle_cap_grant(inode, h, session, cap, msg->middle); + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, + session, cap, issued); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: @@ -3000,8 +3138,6 @@ done: done_unlocked: if (inode) iput(inode); - if (open_target_sessions) - ceph_mdsc_open_export_target_sessions(mdsc, session); return; bad: @@ -3143,7 +3279,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq), rel->mseq = cpu_to_le32(cap->mseq); - rel->caps = cpu_to_le32(cap->issued); + rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 6d59006bfa2..5a743ac141a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -71,9 +71,9 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); if (req->r_got_unsafe) - seq_printf(s, "\t(unsafe)"); + seq_puts(s, "\t(unsafe)"); else - seq_printf(s, "\t"); + seq_puts(s, "\t"); if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); @@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p) } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); + } else { + seq_printf(s, " #%llx", req->r_ino1.ino); } if (req->r_old_dentry) { @@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p) path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", - ceph_ino(req->r_old_dentry_dir), + req->r_old_dentry_dir ? + ceph_ino(req->r_old_dentry_dir) : 0, req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, path ? path : ""); @@ -116,7 +119,7 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, " %s", req->r_path2); } - seq_printf(s, "\n"); + seq_puts(s, "\n"); } mutex_unlock(&mdsc->mutex); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 868b61d56ca..c29d6ae6887 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -100,6 +100,14 @@ static unsigned fpos_off(loff_t p) return p & 0xffffffff; } +static int fpos_cmp(loff_t l, loff_t r) +{ + int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r)); + if (v) + return v; + return (int)(fpos_off(l) - fpos_off(r)); +} + /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on @@ -111,7 +119,8 @@ static unsigned fpos_off(loff_t p) * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ -static int __dcache_readdir(struct file *file, struct dir_context *ctx) +static int __dcache_readdir(struct file *file, struct dir_context *ctx, + u32 shared_gen) { struct ceph_file_info *fi = file->private_data; struct dentry *parent = file->f_dentry; @@ -125,14 +134,14 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx) last = fi->dentry; fi->dentry = NULL; - dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos, - last); + dout("__dcache_readdir %p v%u at %llu (last %p)\n", + dir, shared_gen, ctx->pos, last); spin_lock(&parent->d_lock); /* start at beginning? */ if (ctx->pos == 2 || last == NULL || - ctx->pos < ceph_dentry(last)->offset) { + fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; @@ -153,10 +162,11 @@ more: goto out_unlock; } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!d_unhashed(dentry) && dentry->d_inode && + if (di->lease_shared_gen == shared_gen && + !d_unhashed(dentry) && dentry->d_inode && ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && - ctx->pos <= di->offset) + fpos_cmp(ctx->pos, di->offset) <= 0) break; dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, dentry->d_name.len, dentry->d_name.name, di->offset, @@ -172,9 +182,16 @@ more: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); + dput(dentry); + err = -EAGAIN; + goto out; + } + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), @@ -182,25 +199,18 @@ more: if (last) { /* remember our position */ fi->dentry = last; - fi->next_offset = di->offset; + fi->next_offset = fpos_off(di->offset); } dput(dentry); return 0; } + ctx->pos = di->offset + 1; + if (last) dput(last); last = dentry; - ctx->pos++; - - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - err = -EAGAIN; - goto out; - } - spin_lock(&parent->d_lock); p = p->prev; /* advance to next dentry */ goto more; @@ -244,8 +254,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - const int max_entries = fsc->mount_options->max_readdir; - const int max_bytes = fsc->mount_options->max_readdir_bytes; dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); if (fi->flags & CEPH_F_ATEND) @@ -283,10 +291,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + u32 shared_gen = ci->i_shared_gen; spin_unlock(&ci->i_ceph_lock); - err = __dcache_readdir(file, ctx); + err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; + frag = fpos_frag(ctx->pos); + off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -314,14 +325,16 @@ more: fi->last_readdir = NULL; } - /* requery frag tree, as the frag topology may have changed */ - frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); - dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) { + ceph_mdsc_put_request(req); + return err; + } req->r_inode = inode; ihold(inode); req->r_dentry = dget(file->f_dentry); @@ -332,9 +345,6 @@ more: req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); - req->r_args.readdir.max_entries = cpu_to_le32(max_entries); - req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); - req->r_num_caps = max_entries + 1; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); @@ -352,6 +362,16 @@ more: } /* note next offset and last dentry name */ + rinfo = &req->r_reply_info; + if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; + off = fi->next_offset; + } + fi->frag = frag; fi->offset = fi->next_offset; fi->last_readdir = req; @@ -363,7 +383,6 @@ more: else fi->next_offset = 0; } else { - rinfo = &req->r_reply_info; err = note_last_dentry(fi, rinfo->dir_dname[rinfo->dir_nr-1], rinfo->dir_dname_len[rinfo->dir_nr-1]); @@ -429,7 +448,6 @@ more: if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { dout(" marking %p complete\n", inode); __ceph_dir_set_complete(ci, fi->dir_release_count); - ci->i_max_offset = ctx->pos; } spin_unlock(&ci->i_ceph_lock); @@ -437,7 +455,7 @@ more: return 0; } -static void reset_readdir(struct ceph_file_info *fi) +static void reset_readdir(struct ceph_file_info *fi, unsigned frag) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); @@ -445,7 +463,10 @@ static void reset_readdir(struct ceph_file_info *fi) } kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 2; /* compensate for . and .. */ + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; /* compensate for . and .. */ + else + fi->next_offset = 0; if (fi->dentry) { dput(fi->dentry); fi->dentry = NULL; @@ -457,7 +478,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = offset; + loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; mutex_lock(&inode->i_mutex); @@ -474,7 +495,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) goto out; } - if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset >= 0) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; @@ -487,14 +508,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) * seek to new frag, or seek prior to current chunk. */ if (offset == 0 || - fpos_frag(offset) != fpos_frag(old_offset) || + fpos_frag(offset) != fi->frag || fpos_off(offset) < fi->offset) { dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); + reset_readdir(fi, fpos_frag(offset)); } /* bump dir_release_count if we did a forward seek */ - if (offset > old_offset) + if (fpos_cmp(offset, old_offset) > 0) fi->dir_release_count--; } out: @@ -684,7 +705,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); - if (err) + + if (!err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } @@ -722,7 +746,9 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); - if (err) + if (!err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } @@ -763,7 +789,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); out: - if (err < 0) + if (!err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } @@ -788,8 +816,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); + req->r_old_dentry = dget(old_dentry); req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -887,10 +914,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + ihold(old_dir); req->r_dentry = dget(new_dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); + req->r_old_dentry_dir = old_dir; req->r_locked_dir = new_dir; req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -908,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ - /* d_move screws up d_subdirs order */ - ceph_dir_clear_complete(new_dir); - d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + } ceph_mdsc_put_request(req); return err; @@ -1028,14 +1058,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) valid = 1; } else if (dentry_lease_is_valid(dentry) || dir_lease_is_valid(dir, dentry)) { - valid = 1; + if (dentry->d_inode) + valid = ceph_is_any_caps(dentry->d_inode); + else + valid = 1; } dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); - if (valid) + if (valid) { ceph_dentry_lru_touch(dentry); - else + } else { + ceph_dir_clear_complete(dir); d_drop(dentry); + } iput(dir); return valid; } @@ -1284,6 +1319,8 @@ const struct inode_operations ceph_dir_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, .mkdir = ceph_mkdir, diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 16796be53ca..8d7d782f438 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -8,23 +8,6 @@ #include "mds_client.h" /* - * NFS export support - * - * NFS re-export of a ceph mount is, at present, only semireliable. - * The basic issue is that the Ceph architectures doesn't lend itself - * well to generating filehandles that will remain valid forever. - * - * So, we do our best. If you're lucky, your inode will be in the - * client's cache. If it's not, and you have a connectable fh, then - * the MDS server may be able to find it for you. Otherwise, you get - * ESTALE. - * - * There are ways to this more reliable, but in the non-connectable fh - * case, we won't every work perfectly, and in the connectable case, - * some changes are needed on the MDS side to work better. - */ - -/* * Basic fh */ struct ceph_nfs_fh { @@ -32,22 +15,12 @@ struct ceph_nfs_fh { } __attribute__ ((packed)); /* - * Larger 'connectable' fh that includes parent ino and name hash. - * Use this whenever possible, as it works more reliably. + * Larger fh that includes parent ino. */ struct ceph_nfs_confh { u64 ino, parent_ino; - u32 parent_name_hash; } __attribute__ ((packed)); -/* - * The presence of @parent_inode here tells us whether NFS wants a - * connectable file handle. However, we want to make a connectionable - * file handle unconditionally so that the MDS gets as much of a hint - * as possible. That means we only use @parent_dentry to indicate - * whether nfsd wants a connectable fh, and whether we should indicate - * failure from a too-small @max_len. - */ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { @@ -56,54 +29,36 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct ceph_nfs_confh *cfh = (void *)rawfh; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; - struct dentry *dentry; - struct dentry *parent; /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; - dentry = d_find_alias(inode); + if (parent_inode && (*max_len < connected_handle_length)) { + *max_len = connected_handle_length; + return FILEID_INVALID; + } else if (*max_len < handle_length) { + *max_len = handle_length; + return FILEID_INVALID; + } - /* if we found an alias, generate a connectable fh */ - if (*max_len >= connected_handle_length && dentry) { - dout("encode_fh %p connectable\n", dentry); - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; + if (parent_inode) { + dout("encode_fh %llx with parent %llx\n", + ceph_ino(inode), ceph_ino(parent_inode)); cfh->ino = ceph_ino(inode); - cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, - dentry); + cfh->parent_ino = ceph_ino(parent_inode); *max_len = connected_handle_length; - type = 2; - spin_unlock(&dentry->d_lock); - } else if (*max_len >= handle_length) { - if (parent_inode) { - /* nfsd wants connectable */ - *max_len = connected_handle_length; - type = FILEID_INVALID; - } else { - dout("encode_fh %p\n", dentry); - fh->ino = ceph_ino(inode); - *max_len = handle_length; - type = 1; - } + type = FILEID_INO32_GEN_PARENT; } else { + dout("encode_fh %llx\n", ceph_ino(inode)); + fh->ino = ceph_ino(inode); *max_len = handle_length; - type = FILEID_INVALID; + type = FILEID_INO32_GEN; } - if (dentry) - dput(dentry); return type; } -/* - * convert regular fh to dentry - * - * FIXME: we should try harder by querying the mds for the ino. - */ -static struct dentry *__fh_to_dentry(struct super_block *sb, - struct ceph_nfs_fh *fh, int fh_len) +static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; @@ -111,11 +66,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, struct ceph_vino vino; int err; - if (fh_len < sizeof(*fh) / 4) - return ERR_PTR(-ESTALE); - - dout("__fh_to_dentry %llx\n", fh->ino); - vino.ino = fh->ino; + vino.ino = ino; vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); if (!inode) { @@ -139,139 +90,161 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { - pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", - fh->ino, inode); iput(inode); return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { - iput(inode); + dput(dentry); return ERR_PTR(err); } - dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); + dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry); return dentry; } /* - * convert connectable fh to dentry + * convert regular fh to dentry */ -static struct dentry *__cfh_to_dentry(struct super_block *sb, - struct ceph_nfs_confh *cfh, int fh_len) +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_nfs_fh *fh = (void *)fid->raw; + + if (fh_type != FILEID_INO32_GEN && + fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*fh) / 4) + return NULL; + + dout("fh_to_dentry %llx\n", fh->ino); + return __fh_to_dentry(sb, fh->ino); +} + +static struct dentry *__get_parent(struct super_block *sb, + struct dentry *child, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_request *req; struct inode *inode; struct dentry *dentry; - struct ceph_vino vino; int err; - if (fh_len < sizeof(*cfh) / 4) - return ERR_PTR(-ESTALE); - - dout("__cfh_to_dentry %llx (%llx/%x)\n", - cfh->ino, cfh->parent_ino, cfh->parent_name_hash); - - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; - inode = ceph_find_inode(sb, vino); - if (!inode) { - struct ceph_mds_request *req; - - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, - USE_ANY_MDS); - if (IS_ERR(req)) - return ERR_CAST(req); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); - req->r_ino1 = vino; - req->r_ino2.ino = cfh->parent_ino; - req->r_ino2.snap = CEPH_NOSNAP; - req->r_path2 = kmalloc(16, GFP_NOFS); - snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); - req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, NULL, req); - inode = req->r_target_inode; - if (inode) - ihold(inode); - ceph_mdsc_put_request(req); - if (!inode) - return ERR_PTR(err ? err : -ESTALE); + if (child) { + req->r_inode = child->d_inode; + ihold(child->d_inode); + } else { + req->r_ino1 = (struct ceph_vino) { + .ino = ino, + .snap = CEPH_NOSNAP, + }; } + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ENOENT); dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { - pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); iput(inode); return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { - iput(inode); + dput(dentry); return ERR_PTR(err); } - dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); + dout("__get_parent ino %llx parent %p ino %llx.%llx\n", + child ? ceph_ino(child->d_inode) : ino, + dentry, ceph_vinop(inode)); return dentry; } -static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +static struct dentry *ceph_get_parent(struct dentry *child) { - if (fh_type == 1) - return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw, - fh_len); - else - return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw, - fh_len); + /* don't re-export snaps */ + if (ceph_snap(child->d_inode) != CEPH_NOSNAP) + return ERR_PTR(-EINVAL); + + dout("get_parent %p ino %llx.%llx\n", + child, ceph_vinop(child->d_inode)); + return __get_parent(child->d_sb, child, 0); } /* - * get parent, if possible. - * - * FIXME: we could do better by querying the mds to discover the - * parent. + * convert regular fh to parent */ static struct dentry *ceph_fh_to_parent(struct super_block *sb, - struct fid *fid, + struct fid *fid, int fh_len, int fh_type) { struct ceph_nfs_confh *cfh = (void *)fid->raw; - struct ceph_vino vino; - struct inode *inode; struct dentry *dentry; - int err; - if (fh_type == 1) - return ERR_PTR(-ESTALE); + if (fh_type != FILEID_INO32_GEN_PARENT) + return NULL; if (fh_len < sizeof(*cfh) / 4) - return ERR_PTR(-ESTALE); + return NULL; - pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, - cfh->parent_name_hash); + dout("fh_to_parent %llx\n", cfh->parent_ino); + dentry = __get_parent(sb, NULL, cfh->ino); + if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT) + dentry = __fh_to_dentry(sb, cfh->parent_ino); + return dentry; +} - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; - inode = ceph_find_inode(sb, vino); - if (!inode) - return ERR_PTR(-ESTALE); +static int ceph_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct ceph_mds_client *mdsc; + struct ceph_mds_request *req; + int err; - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); - iput(inode); - return dentry; - } - err = ceph_init_dentry(dentry); - if (err < 0) { - iput(inode); - return ERR_PTR(err); + mdsc = ceph_inode_to_client(child->d_inode)->mdsc; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, + USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + mutex_lock(&parent->d_inode->i_mutex); + + req->r_inode = child->d_inode; + ihold(child->d_inode); + req->r_ino2 = ceph_vino(parent->d_inode); + req->r_locked_dir = parent->d_inode; + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + + mutex_unlock(&parent->d_inode->i_mutex); + + if (!err) { + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + memcpy(name, rinfo->dname, rinfo->dname_len); + name[rinfo->dname_len] = 0; + dout("get_name %p ino %llx.%llx name %s\n", + child, ceph_vinop(child->d_inode), name); + } else { + dout("get_name %p ino %llx.%llx err %d\n", + child, ceph_vinop(child->d_inode), err); } - dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); - return dentry; + + ceph_mdsc_put_request(req); + return err; } const struct export_operations ceph_export_ops = { .encode_fh = ceph_encode_fh, .fh_to_dentry = ceph_fh_to_dentry, .fh_to_parent = ceph_fh_to_parent, + .get_parent = ceph_get_parent, + .get_name = ceph_get_name, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3de89829e2a..302085100c2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -210,7 +210,7 @@ int ceph_open(struct inode *inode, struct file *file) ihold(inode); req->r_num_caps = 1; - if (flags & (O_CREAT|O_TRUNC)) + if (flags & O_CREAT) parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); iput(parent_inode); @@ -286,12 +286,14 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } else { dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { + ceph_init_acl(dentry, dentry->d_inode, dir); *opened |= FILE_CREATED; } err = finish_open(file, dentry, ceph_open, opened); } - out_err: + if (!req->r_err && req->r_target_inode) + ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); ceph_mdsc_put_request(req); dout("atomic_open result=%d\n", err); return err; @@ -408,51 +410,82 @@ more: * * If the read spans object boundary, just do multiple reads. */ -static ssize_t ceph_sync_read(struct file *file, char __user *data, - unsigned len, loff_t *poff, int *checkeof) +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, + int *checkeof) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct page **pages; - u64 off = *poff; + u64 off = iocb->ki_pos; int num_pages, ret; + size_t len = iov_iter_count(i); - dout("sync_read on file %p %llu~%u %s\n", file, off, len, + dout("sync_read on file %p %llu~%u %s\n", file, off, + (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - - if (file->f_flags & O_DIRECT) { - num_pages = calc_pages_for((unsigned long)data, len); - pages = ceph_get_direct_page_vector(data, num_pages, true); - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); - } - if (IS_ERR(pages)) - return PTR_ERR(pages); - /* * flush any page cache pages in this range. this * will make concurrent normal and sync io slow, * but it will at least behave sensibly when they are * in sequence. */ - ret = filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait_range(inode->i_mapping, off, + off + len); if (ret < 0) - goto done; + return ret; - ret = striped_read(inode, off, len, pages, num_pages, checkeof, - file->f_flags & O_DIRECT, - (unsigned long)data & ~PAGE_MASK); + if (file->f_flags & O_DIRECT) { + while (iov_iter_count(i)) { + size_t start; + ssize_t n; - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); - if (ret >= 0) - *poff = off + ret; + n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start); + if (n < 0) + return n; -done: - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, true); - else + num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; + + ret = striped_read(inode, off, n, + pages, num_pages, checkeof, + 1, start); + + ceph_put_page_vector(pages, num_pages, true); + + if (ret <= 0) + break; + off += ret; + iov_iter_advance(i, ret); + if (ret < n) + break; + } + } else { + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof, 0, 0); + if (ret > 0) { + int l, k = 0; + size_t left = ret; + + while (left) { + int copy = min_t(size_t, PAGE_SIZE, left); + l = copy_page_to_iter(pages[k++], 0, copy, i); + off += l; + left -= l; + if (l < copy) + break; + } + } ceph_release_page_vector(pages, num_pages); + } + + if (off > iocb->ki_pos) { + ret = off - iocb->ki_pos; + iocb->ki_pos = off; + } + dout("sync_read result %d\n", ret); return ret; } @@ -489,148 +522,249 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } } + /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages. * * If write spans object boundary, just do multiple writes. (For a * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t pos, loff_t *ppos) +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; - int num_ops = 1; struct page **pages; int num_pages; - u64 len; int written = 0; int flags; int check_caps = 0; - int page_align, io_align; - unsigned long buf_align; int ret; struct timespec mtime = CURRENT_TIME; - bool own_pages = false; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u %s\n", file, pos, - (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + dout("sync_direct_write on file %p %lld~%u\n", file, pos, + (unsigned)count); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) return ret; ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_CACHE_SHIFT, - (pos + left) >> PAGE_CACHE_SHIFT); + (pos + count) >> PAGE_CACHE_SHIFT); if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) - flags |= CEPH_OSD_FLAG_ACK; - else - num_ops++; /* Also include a 'startsync' command. */ - - /* - * we may need to do multiple writes here if we span an object - * boundary. this isn't atomic, unfortunately. :( - */ -more: - io_align = pos & ~PAGE_MASK; - buf_align = (unsigned long)data & ~PAGE_MASK; - len = left; - snapc = ci->i_snap_realm->cached_context; - vino = ceph_vino(inode); - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, num_ops, - CEPH_OSD_OP_WRITE, flags, snapc, - ci->i_truncate_seq, ci->i_truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); + while (iov_iter_count(from) > 0) { + u64 len = iov_iter_single_seg_count(from); + size_t start; + ssize_t n; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, + 2,/*include a 'startsync' command*/ + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } - /* write from beginning of first page, regardless of io alignment */ - page_align = file->f_flags & O_DIRECT ? buf_align : io_align; - num_pages = calc_pages_for(page_align, len); - if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, false); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); - goto out; + n = iov_iter_get_pages_alloc(from, &pages, len, &start); + if (unlikely(n < 0)) { + ret = n; + ceph_osdc_put_request(req); + break; } + num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; /* * throw out any page cache pages in this range. this * may block. */ truncate_inode_pages_range(inode->i_mapping, pos, - (pos+len) | (PAGE_CACHE_SIZE-1)); - } else { + (pos+n) | (PAGE_CACHE_SIZE-1)); + osd_req_op_extent_osd_data_pages(req, 0, pages, n, start, + false, false); + + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_put_page_vector(pages, num_pages, false); + + ceph_osdc_put_request(req); + if (ret) + break; + pos += n; + written += n; + iov_iter_advance(from, n); + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } + + if (ret != -EOLDSNAPC && written > 0) { + iocb->ki_pos = pos; + ret = written; + } + return ret; +} + + +/* + * Synchronous write, straight from __user pointer or user pages. + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; + struct ceph_osd_request *req; + struct page **pages; + u64 len; + int num_pages; + int written = 0; + int flags; + int check_caps = 0; + int ret; + struct timespec mtime = CURRENT_TIME; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); + + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ACK; + + while ((len = iov_iter_count(from)) > 0) { + size_t left; + int n; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, 1, + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + + /* + * write from beginning of first page, + * regardless of io alignment + */ + num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; } - ret = ceph_copy_user_to_page_vector(pages, data, pos, len); + + left = len; + for (n = 0; n < num_pages; n++) { + size_t plen = min_t(size_t, left, PAGE_SIZE); + ret = copy_page_from_iter(pages[n], 0, plen, from); + if (ret != plen) { + ret = -EFAULT; + break; + } + left -= ret; + } + if (ret < 0) { ceph_release_page_vector(pages, num_pages); goto out; } - if ((file->f_flags & O_SYNC) == 0) { - /* get a second commit callback */ - req->r_unsafe_callback = ceph_sync_write_unsafe; - req->r_inode = inode; - own_pages = true; - } - } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, - false, own_pages); + /* get a second commit callback */ + req->r_unsafe_callback = ceph_sync_write_unsafe; + req->r_inode = inode; - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + false, true); - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, false); - else if (file->f_flags & O_SYNC) - ceph_release_page_vector(pages, num_pages); + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); out: - ceph_osdc_put_request(req); - if (ret == 0) { - pos += len; - written += len; - left -= len; - data += len; - if (left) - goto more; + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } else + break; + } + if (ret != -EOLDSNAPC && written > 0) { ret = written; - *ppos = pos; - if (pos > i_size_read(inode)) - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, - NULL); - } else if (ret != -EOLDSNAPC && written > 0) { - ret = written; + iocb->ki_pos = pos; } return ret; } @@ -642,60 +776,69 @@ out: * * Hmm, the sync read case isn't actually async... should it be? */ -static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - loff_t *ppos = &iocb->ki_pos; - size_t len = iov->iov_len; + size_t len = iocb->ki_nbytes; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - void __user *base = iov->iov_base; ssize_t ret; int want, got = 0; int checkeof = 0, read = 0; - dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", - inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); if (ret < 0) - goto out; - dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)len, - ceph_cap_string(got)); + return ret; if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (fi->flags & CEPH_F_SYNC)) + (fi->flags & CEPH_F_SYNC)) { + + dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + /* hmm, this isn't really async... */ - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); - else - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + ret = ceph_sync_read(iocb, to, &checkeof); + } else { + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); -out: + ret = generic_file_read_iter(iocb, to); + } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); ceph_put_cap_refs(ci, got); if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + int statret = ceph_do_getattr(inode, + CEPH_STAT_CAP_SIZE); /* hit EOF or hole? */ - if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); + if (statret == 0 && iocb->ki_pos < inode->i_size && + ret < len) { + dout("sync_read hit hole, ppos %lld < size %lld" + ", reading more\n", iocb->ki_pos, + inode->i_size); + + iov_iter_advance(to, ret); read += ret; - base += ret; len -= ret; checkeof = 0; goto again; } } + if (ret >= 0) ret += read; @@ -712,8 +855,7 @@ out: * * If we are near ENOSPC, write synchronously. */ -static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct ceph_file_info *fi = file->private_data; @@ -721,18 +863,15 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; - ssize_t count, written = 0; + ssize_t count = iov_iter_count(from), written = 0; int err, want, got; + loff_t pos = iocb->ki_pos; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; mutex_lock(&inode->i_mutex); - err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (err) - goto out; - /* We can write back this queue in page reclaim */ current->backing_dev_info = file->f_mapping->backing_dev_info; @@ -742,6 +881,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; + iov_iter_truncate(from, count); err = file_remove_suid(file); if (err) @@ -772,20 +912,27 @@ retry_snap: inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || - (fi->flags & CEPH_F_SYNC)) { + (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + struct iov_iter data; mutex_unlock(&inode->i_mutex); - written = ceph_sync_write(file, iov->iov_base, count, - pos, &iocb->ki_pos); + /* we might need to revert back to that point */ + data = *from; + if (file->f_flags & O_DIRECT) + written = ceph_sync_direct_write(iocb, &data); + else + written = ceph_sync_write(iocb, &data); if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), - pos, (unsigned)iov->iov_len); + pos, (unsigned)count); mutex_lock(&inode->i_mutex); goto retry_snap; } + if (written > 0) + iov_iter_advance(from, written); } else { + loff_t old_size = inode->i_size; /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -793,9 +940,11 @@ retry_snap: * are pending vmtruncate. So write and vmtruncate * can not run at the same time */ - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, &iocb->ki_pos, - count, 0); + written = generic_perform_write(file, from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; + if (inode->i_size > old_size) + ceph_fscache_update_objectsize(inode); mutex_unlock(&inode->i_mutex); } @@ -809,7 +958,7 @@ retry_snap: } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + inode, ceph_vinop(inode), pos, (unsigned)count, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); @@ -1018,7 +1167,7 @@ static long ceph_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct ceph_file_info *fi = file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; @@ -1031,9 +1180,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { @@ -1089,16 +1235,16 @@ const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, .llseek = ceph_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = ceph_aio_read, - .aio_write = ceph_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = ceph_read_iter, + .write_iter = ceph_write_iter, .mmap = ceph_mmap, .fsync = ceph_fsync, .lock = ceph_lock, .flock = ceph_flock, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, .fallocate = ceph_fallocate, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8549a48115f..04c89c266ce 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -9,6 +9,8 @@ #include <linux/namei.h> #include <linux/writeback.h> #include <linux/vmalloc.h> +#include <linux/posix_acl.h> +#include <linux/random.h> #include "super.h" #include "mds_client.h" @@ -95,6 +97,8 @@ const struct inode_operations ceph_file_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; @@ -176,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) * specified, copy the frag delegation info to the caller if * it is present. */ -u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, - struct ceph_inode_frag *pfrag, - int *found) +static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) { u32 t = ceph_frag_make(0, 0); struct ceph_inode_frag *frag; @@ -188,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, if (found) *found = 0; - mutex_lock(&ci->i_fragtree_mutex); while (1) { WARN_ON(!ceph_frag_contains_value(t, v)); frag = __ceph_find_frag(ci, t); @@ -217,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, } dout("choose_frag(%x) = %x\n", v, t); - mutex_unlock(&ci->i_fragtree_mutex); return t; } +u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) +{ + u32 ret; + mutex_lock(&ci->i_fragtree_mutex); + ret = __ceph_choose_frag(ci, v, pfrag, found); + mutex_unlock(&ci->i_fragtree_mutex); + return ret; +} + /* * Process dirfrag (delegation) info from the mds. Include leaf * fragment in tree ONLY if ndist > 0. Otherwise, only @@ -234,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode, u32 id = le32_to_cpu(dirinfo->frag); int mds = le32_to_cpu(dirinfo->auth); int ndist = le32_to_cpu(dirinfo->ndist); + int diri_auth = -1; int i; int err = 0; + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) + diri_auth = ci->i_auth_cap->mds; + spin_unlock(&ci->i_ceph_lock); + mutex_lock(&ci->i_fragtree_mutex); - if (ndist == 0) { + if (ndist == 0 && mds == diri_auth) { /* no delegation info needed. */ frag = __ceph_find_frag(ci, id); if (!frag) @@ -283,6 +300,75 @@ out: return err; } +static int ceph_fill_fragtree(struct inode *inode, + struct ceph_frag_tree_head *fragtree, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + struct rb_node *rb_node; + int i; + u32 id, nsplits; + bool update = false; + + mutex_lock(&ci->i_fragtree_mutex); + nsplits = le32_to_cpu(fragtree->nsplits); + if (nsplits) { + i = prandom_u32() % nsplits; + id = le32_to_cpu(fragtree->splits[i].frag); + if (!__ceph_find_frag(ci, id)) + update = true; + } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) { + rb_node = rb_first(&ci->i_fragtree); + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node)) + update = true; + } + if (!update && dirinfo) { + id = le32_to_cpu(dirinfo->frag); + if (id != __ceph_choose_frag(ci, id, NULL, NULL)) + update = true; + } + if (!update) + goto out_unlock; + + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); + rb_node = rb_first(&ci->i_fragtree); + for (i = 0; i < nsplits; i++) { + id = le32_to_cpu(fragtree->splits[i].frag); + frag = NULL; + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (ceph_frag_compare(frag->frag, id) >= 0) { + if (frag->frag != id) + frag = NULL; + else + rb_node = rb_next(rb_node); + break; + } + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + frag = NULL; + } + if (!frag) { + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) + continue; + } + frag->split_by = le32_to_cpu(fragtree->splits[i].by); + dout(" frag %x split by %d\n", frag->frag, frag->split_by); + } + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } +out_unlock: + mutex_unlock(&ci->i_fragtree_mutex); + return 0; +} /* * initialize a newly allocated inode. @@ -335,9 +421,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; INIT_LIST_HEAD(&ci->i_cap_delay_list); - ci->i_cap_exporting_mds = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_issued = 0; INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; @@ -406,7 +489,7 @@ void ceph_destroy_inode(struct inode *inode) /* * we may still have a snap_realm reference if there are stray - * caps in i_cap_exporting_issued or i_snap_caps. + * caps in i_snap_caps. */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = @@ -436,6 +519,16 @@ void ceph_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, ceph_i_callback); } +int ceph_drop_inode(struct inode *inode) +{ + /* + * Positve dentry and corresponding inode are always accompanied + * in MDS reply. So no need to keep inode in the cache after + * dropping all its aliases. + */ + return 1; +} + /* * Helpers to fill in size, ctime, mtime, and atime. We have to be * careful because either the client or MDS may have more up to date @@ -571,20 +664,26 @@ static int fill_inode(struct inode *inode, unsigned long ttl_from, int cap_fmode, struct ceph_cap_reservation *caps_reservation) { + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); - int i; - int issued = 0, implemented; + int issued = 0, implemented, new_issued; struct timespec mtime, atime, ctime; - u32 nsplits; struct ceph_buffer *xattr_blob = NULL; + struct ceph_cap *new_cap = NULL; int err = 0; - int queue_trunc = 0; + bool wake = false; + bool queue_trunc = false; + bool new_version = false; dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); + /* prealloc new cap struct */ + if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP) + new_cap = ceph_get_cap(mdsc, caps_reservation); + /* * prealloc xattr data, if it looks like we'll need it. only * if len > 4 (meaning there are actually xattrs; the first 4 @@ -610,19 +709,23 @@ static int fill_inode(struct inode *inode, * 3 2 skip * 3 3 update */ - if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) >= le64_to_cpu(info->version)) - goto no_change; - + if (ci->i_version == 0 || + ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + le64_to_cpu(info->version) > (ci->i_version & ~1))) + new_version = true; + issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); + new_issued = ~issued & le32_to_cpu(info->cap.caps); /* update inode */ ci->i_version = le64_to_cpu(info->version); inode->i_version++; inode->i_rdev = le32_to_cpu(info->rdev); + inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(info->mode); inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); @@ -631,31 +734,35 @@ static int fill_inode(struct inode *inode, from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && + (issued & CEPH_CAP_LINK_EXCL) == 0) set_nlink(inode, le32_to_cpu(info->nlink)); - /* be careful with mtime, atime, size */ - ceph_decode_timespec(&atime, &info->atime); - ceph_decode_timespec(&mtime, &info->mtime); - ceph_decode_timespec(&ctime, &info->ctime); - queue_trunc = ceph_fill_file_size(inode, issued, - le32_to_cpu(info->truncate_seq), - le64_to_cpu(info->truncate_size), - le64_to_cpu(info->size)); - ceph_fill_file_time(inode, issued, - le32_to_cpu(info->time_warp_seq), - &ctime, &mtime, &atime); - - /* only update max_size on auth cap */ - if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && - ci->i_max_size != le64_to_cpu(info->max_size)) { - dout("max_size %lld -> %llu\n", ci->i_max_size, - le64_to_cpu(info->max_size)); - ci->i_max_size = le64_to_cpu(info->max_size); + if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { + /* be careful with mtime, atime, size */ + ceph_decode_timespec(&atime, &info->atime); + ceph_decode_timespec(&mtime, &info->mtime); + ceph_decode_timespec(&ctime, &info->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(info->time_warp_seq), + &ctime, &mtime, &atime); } - ci->i_layout = info->layout; - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + if (new_version || + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + ci->i_layout = info->layout; + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(info->truncate_seq), + le64_to_cpu(info->truncate_size), + le64_to_cpu(info->size)); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + } /* xattrs */ /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ @@ -668,6 +775,7 @@ static int fill_inode(struct inode *inode, memcpy(ci->i_xattrs.blob->vec.iov_base, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + ceph_forget_all_cached_acls(inode); xattr_blob = NULL; } @@ -738,29 +846,7 @@ static int fill_inode(struct inode *inode, !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); - ci->i_max_offset = 2; - } -no_change: - spin_unlock(&ci->i_ceph_lock); - - /* queue truncate if we saw i_size decrease */ - if (queue_trunc) - ceph_queue_vmtruncate(inode); - - /* populate frag tree */ - /* FIXME: move me up, if/when version reflects fragtree changes */ - nsplits = le32_to_cpu(info->fragtree.nsplits); - mutex_lock(&ci->i_fragtree_mutex); - for (i = 0; i < nsplits; i++) { - u32 id = le32_to_cpu(info->fragtree.splits[i].frag); - struct ceph_inode_frag *frag = __get_or_create_frag(ci, id); - - if (IS_ERR(frag)) - continue; - frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); - dout(" frag %x split by %d\n", frag->frag, frag->split_by); } - mutex_unlock(&ci->i_fragtree_mutex); /* were we issued a capability? */ if (info->cap.caps) { @@ -773,30 +859,41 @@ no_change: le32_to_cpu(info->cap.seq), le32_to_cpu(info->cap.mseq), le64_to_cpu(info->cap.realm), - info->cap.flags, - caps_reservation); + info->cap.flags, &new_cap); + wake = true; } else { - spin_lock(&ci->i_ceph_lock); dout(" %p got snap_caps %s\n", inode, ceph_cap_string(le32_to_cpu(info->cap.caps))); ci->i_snap_caps |= le32_to_cpu(info->cap.caps); if (cap_fmode >= 0) __ceph_get_fmode(ci, cap_fmode); - spin_unlock(&ci->i_ceph_lock); } } else if (cap_fmode >= 0) { - pr_warning("mds issued no caps on %llx.%llx\n", + pr_warn("mds issued no caps on %llx.%llx\n", ceph_vinop(inode)); __ceph_get_fmode(ci, cap_fmode); } + spin_unlock(&ci->i_ceph_lock); + + if (wake) + wake_up_all(&ci->i_cap_wq); + + /* queue truncate if we saw i_size decrease */ + if (queue_trunc) + ceph_queue_vmtruncate(inode); + + /* populate frag tree */ + if (S_ISDIR(inode->i_mode)) + ceph_fill_fragtree(inode, &info->fragtree, dirinfo); /* update delegation info? */ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); err = 0; - out: + if (new_cap) + ceph_put_cap(mdsc, new_cap); if (xattr_blob) ceph_buffer_put(xattr_blob); return err; @@ -853,41 +950,6 @@ out_unlock: } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - * - * Always called under directory's i_mutex. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - ci = ceph_inode(inode); - di = ceph_dentry(dn); - - spin_lock(&ci->i_ceph_lock); - if (!__ceph_dir_is_complete(ci)) { - spin_unlock(&ci->i_ceph_lock); - return; - } - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&ci->i_ceph_lock); - - spin_lock(&dir->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_u.d_child, &dir->d_subdirs); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dir->d_lock); -} - -/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. * @@ -896,7 +958,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) * the caller) if we fail. */ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash, bool set_offset) + bool *prehash) { struct dentry *realdn; @@ -928,8 +990,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); - if (set_offset) - ceph_set_dentry_offset(dn); out: return dn; } @@ -950,10 +1010,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); - int i = 0; int err = 0; dout("fill_trace %p is_dentry %d is_target %d\n", req, @@ -1008,10 +1066,82 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, session, req->r_request_started, -1, &req->r_caps_reservation); if (err < 0) - return err; + goto done; } else { WARN_ON_ONCE(1); } + + if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { + struct qstr dname; + struct dentry *dn, *parent; + + BUG_ON(!rinfo->head->is_target); + BUG_ON(req->r_dentry); + + parent = d_find_any_alias(dir); + BUG_ON(!parent); + + dname.name = rinfo->dname; + dname.len = rinfo->dname_len; + dname.hash = full_name_hash(dname.name, dname.len); + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (dn == NULL) { + dput(parent); + err = -ENOMEM; + goto done; + } + err = ceph_init_dentry(dn); + if (err < 0) { + dput(dn); + dput(parent); + goto done; + } + } else if (dn->d_inode && + (ceph_ino(dn->d_inode) != vino.ino || + ceph_snap(dn->d_inode) != vino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, dn->d_inode); + d_delete(dn); + dput(dn); + goto retry_lookup; + } + + req->r_dentry = dn; + dput(parent); + } + } + + if (rinfo->head->is_target) { + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } + req->r_target_inode = in; + + err = fill_inode(in, &rinfo->targeti, NULL, + session, req->r_request_started, + (!req->r_aborted && rinfo->head->result == 0) ? + req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err("fill_inode badness %p %llx.%llx\n", + in, ceph_vinop(in)); + goto done; + } } /* @@ -1053,6 +1183,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, req->r_old_dentry->d_name.len, @@ -1072,18 +1205,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* - * d_move() puts the renamed dentry at the end of - * d_subdirs. We need to assign it an appropriate - * directory offset so we can behave when dir is - * complete. - */ - ceph_set_dentry_offset(req->r_old_dentry); + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(dir); + ceph_dir_clear_complete(olddir); + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); dn = req->r_old_dentry; /* use old_dentry */ - in = dn->d_inode; } /* null dentry? */ @@ -1105,99 +1234,46 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, } /* attach proper inode */ - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = dn->d_inode; - if (!in) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_trace bad get_inode " - "%llx.%llx\n", vino.ino, vino.snap); - err = PTR_ERR(in); - d_drop(dn); - goto done; - } - dn = splice_dentry(dn, in, &have_lease, true); + if (!dn->d_inode) { + ceph_dir_clear_complete(dir); + ihold(in); + dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - ihold(in); - } else if (ceph_ino(in) == vino.ino && - ceph_snap(in) == vino.snap) { - ihold(in); - } else { + } else if (dn->d_inode && dn->d_inode != in) { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, in, ceph_ino(in), ceph_snap(in), - vino.ino, vino.snap); + dn, dn->d_inode, ceph_vinop(dn->d_inode), + ceph_vinop(in)); have_lease = false; - in = NULL; } if (have_lease) update_dentry_lease(dn, rinfo->dlease, session, req->r_request_started); dout(" final dn %p\n", dn); - i++; - } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || - req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) { + } else if (!req->r_aborted && + (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; + struct inode *dir = req->r_locked_dir; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); - BUG_ON(!req->r_locked_dir); - BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_inode get_inode badness %llx.%llx\n", - vino.ino, vino.snap); - err = PTR_ERR(in); - d_delete(dn); - goto done; - } + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); - dn = splice_dentry(dn, in, NULL, true); + ceph_dir_clear_complete(dir); + ihold(in); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - ihold(in); - rinfo->head->is_dentry = 1; /* fool notrace handlers */ - } - - if (rinfo->head->is_target) { - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); - - if (in == NULL || ceph_ino(in) != vino.ino || - ceph_snap(in) != vino.snap) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - err = PTR_ERR(in); - goto done; - } - } - req->r_target_inode = in; - - err = fill_inode(in, - &rinfo->targeti, NULL, - session, req->r_request_started, - (le32_to_cpu(rinfo->head->result) == 0) ? - req->r_fmode : -1, - &req->r_caps_reservation); - if (err < 0) { - pr_err("fill_inode badness %p %llx.%llx\n", - in, ceph_vinop(in)); - goto done; - } } - done: dout("fill_trace done err=%d\n", err); return err; @@ -1247,11 +1323,23 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct qstr dname; struct dentry *dn; struct inode *in; - int err = 0, i; + int err = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - u64 frag = le32_to_cpu(rhead->args.readdir.frag); struct ceph_dentry_info *di; + u64 r_readdir_offset = req->r_readdir_offset; + u32 frag = le32_to_cpu(rhead->args.readdir.frag); + + if (rinfo->dir_dir && + le32_to_cpu(rinfo->dir_dir->frag) != frag) { + dout("readdir_prepopulate got new frag %x -> %x\n", + frag, le32_to_cpu(rinfo->dir_dir->frag)); + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + r_readdir_offset = 2; + else + r_readdir_offset = 0; + } if (req->r_aborted) return readdir_prepopulate_inodes_only(req, session); @@ -1268,6 +1356,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); } + /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_vino vino; @@ -1292,9 +1381,10 @@ retry_lookup: err = -ENOMEM; goto out; } - err = ceph_init_dentry(dn); - if (err < 0) { + ret = ceph_init_dentry(dn); + if (ret < 0) { dput(dn); + err = ret; goto out; } } else if (dn->d_inode && @@ -1314,9 +1404,6 @@ retry_lookup: spin_unlock(&parent->d_lock); } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); - /* inode */ if (dn->d_inode) { in = dn->d_inode; @@ -1329,26 +1416,39 @@ retry_lookup: err = PTR_ERR(in); goto out; } - dn = splice_dentry(dn, in, NULL, false); - if (IS_ERR(dn)) - dn = NULL; } if (fill_inode(in, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); + if (!dn->d_inode) + iput(in); + d_drop(dn); goto next_item; } - if (dn) - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, - req->r_request_started); + + if (!dn->d_inode) { + dn = splice_dentry(dn, in, NULL); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + dn = NULL; + goto next_item; + } + } + + di = dn->d_fsdata; + di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); next_item: if (dn) dput(dn); } - req->r_did_prepopulate = true; + if (err == 0) + req->r_did_prepopulate = true; out: if (snapdir) { @@ -1437,7 +1537,8 @@ static void ceph_invalidate_work(struct work_struct *work) dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { - /* nevermind! */ + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); goto out; @@ -1445,7 +1546,7 @@ static void ceph_invalidate_work(struct work_struct *work) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_inode_pages(inode->i_mapping, 0); + truncate_pagecache(inode, 0); spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && @@ -1458,13 +1559,14 @@ static void ceph_invalidate_work(struct work_struct *work) dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", inode, orig_gen, ci->i_rdcache_gen, ci->i_rdcache_revoking); + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; } spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); - +out: if (check) ceph_check_caps(ci, 0, NULL); -out: iput(inode); } @@ -1547,7 +1649,7 @@ retry: ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); - truncate_inode_pages(inode->i_mapping, to); + truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); if (to == ci->i_truncate_size) { @@ -1594,7 +1696,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; @@ -1658,6 +1759,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || attr->ia_mode != inode->i_mode) { + inode->i_mode = attr->ia_mode; req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); mask |= CEPH_SETATTR_MODE; release |= CEPH_CAP_AUTH_SHARED; @@ -1773,15 +1875,19 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); + if (ia_valid & ATTR_MODE) { + err = posix_acl_chmod(inode, attr->ia_mode); + if (err) + goto out_put; + } + if (mask) { req->r_inode = inode; ihold(inode); req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); @@ -1792,6 +1898,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) return err; out: spin_unlock(&ci->i_ceph_lock); +out_put: ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 669622fd1ae..a822a6e5829 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,9 +1,8 @@ +#include <linux/ceph/ceph_debug.h> #include <linux/in.h> #include "super.h" #include "mds_client.h" -#include <linux/ceph/ceph_debug.h> - #include "ioctl.h" @@ -64,7 +63,6 @@ static long __validate_layout(struct ceph_mds_client *mdsc, static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; @@ -111,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -121,9 +121,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) cpu_to_le32(l.object_size); req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); - parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; } @@ -157,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); @@ -183,6 +182,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_object_locator oloc; + struct ceph_object_id oid; u64 len = 1, olen; u64 tmp; struct ceph_pg pgid; @@ -211,8 +212,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, - ceph_file_layout_pg_pool(ci->i_layout)); + oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + ceph_oid_set_name(&oid, dl.object_name); + + r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); if (r < 0) { up_read(&osdc->map_sem); return r; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ae6d14e82b0..fbc39c47bac 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -2,11 +2,31 @@ #include <linux/file.h> #include <linux/namei.h> +#include <linux/random.h> #include "super.h" #include "mds_client.h" #include <linux/ceph/pagelist.h> +static u64 lock_secret; + +static inline u64 secure_addr(void *addr) +{ + u64 v = lock_secret ^ (u64)(unsigned long)addr; + /* + * Set the most significant bit, so that MDS knows the 'owner' + * is sufficient to identify the owner of lock. (old code uses + * both 'owner' and 'pid') + */ + v |= (1ULL << 63); + return v; +} + +void __init ceph_flock_init(void) +{ + get_random_bytes(&lock_secret, sizeof(lock_secret)); +} + /** * Implement fcntl and flock locking functions. */ @@ -14,17 +34,18 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, int cmd, u8 wait, struct file_lock *fl) { struct inode *inode = file_inode(file); - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; u64 length = 0; + u64 owner; req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) @@ -32,25 +53,24 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, else length = fl->fl_end - fl->fl_start + 1; - dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type: %d", (int)lock_type, - (int)operation, (u64)fl->fl_pid, fl->fl_start, - length, wait, fl->fl_type); + owner = secure_addr(fl->fl_owner); + + dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " + "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, + (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, + wait, fl->fl_type); req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.owner = cpu_to_le64(owner); req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); - /* This should be adjusted, but I'm not sure if - namespaces actually get id numbers*/ - req->r_args.filelock_change.pid_namespace = - cpu_to_le64((u64)(unsigned long)fl->fl_nspid); req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; err = ceph_mdsc_do_request(mdsc, inode, req); - if ( operation == CEPH_MDS_OP_GETFILELOCK){ + if (operation == CEPH_MDS_OP_GETFILELOCK) { fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) fl->fl_type = F_RDLCK; @@ -87,14 +107,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) u8 wait = 0; u16 op = CEPH_MDS_OP_SETFILELOCK; - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_lock, fl_pid:%d", fl->fl_pid); + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_lock, fl_owner: %p", fl->fl_owner); /* set wait bit as appropriate, then make command as Ceph expects it*/ - if (F_SETLKW == cmd) - wait = 1; - if (F_GETLK == cmd) + if (IS_GETLK(cmd)) op = CEPH_MDS_OP_GETFILELOCK; + else if (IS_SETLKW(cmd)) + wait = 1; if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; @@ -105,7 +130,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); if (!err) { - if ( op != CEPH_MDS_OP_GETFILELOCK ){ + if (op != CEPH_MDS_OP_GETFILELOCK) { dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { @@ -131,20 +156,22 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { u8 lock_cmd; int err; - u8 wait = 1; - - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_flock, fl_pid:%d", fl->fl_pid); - - /* set wait bit, then clear it out of cmd*/ - if (cmd & LOCK_NB) - wait = 0; - cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); - /* set command sequence that Ceph wants to see: - shared lock, exclusive lock, or unlock */ - if (LOCK_SH == cmd) + u8 wait = 0; + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_flock, fl_file: %p", fl->fl_file); + + if (IS_SETLKW(cmd)) + wait = 1; + + if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; - else if (LOCK_EX == cmd) + else if (F_WRLCK == fl->fl_type) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; @@ -280,13 +307,11 @@ int lock_to_ceph_filelock(struct file_lock *lock, struct ceph_filelock *cephlock) { int err = 0; - cephlock->start = cpu_to_le64(lock->fl_start); cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64(lock->fl_pid); - cephlock->pid_namespace = - cpu_to_le64((u64)(unsigned long)lock->fl_nspid); + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); switch (lock->fl_type) { case F_RDLCK: diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b7bda5d9611..92a2548278f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3,6 +3,7 @@ #include <linux/fs.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/gfp.h> #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -43,6 +44,7 @@ */ struct ceph_reconnect_state { + int nr_caps; struct ceph_pagelist *pagelist; bool flock; }; @@ -62,7 +64,7 @@ static const struct ceph_connection_operations mds_con_ops; */ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, - int features) + u64 features) { int err = -EIO; @@ -97,7 +99,7 @@ bad: */ static int parse_reply_info_trace(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { int err; @@ -144,7 +146,7 @@ out_bad: */ static int parse_reply_info_dir(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { u32 num, i = 0; int err; @@ -164,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end, if (num == 0) goto done; - /* alloc large array */ - info->dir_nr = num; - info->dir_in = kcalloc(num, sizeof(*info->dir_in) + - sizeof(*info->dir_dname) + - sizeof(*info->dir_dname_len) + - sizeof(*info->dir_dlease), - GFP_NOFS); - if (info->dir_in == NULL) { - err = -ENOMEM; - goto out_bad; - } + BUG_ON(!info->dir_in); info->dir_dname = (void *)(info->dir_in + num); info->dir_dname_len = (void *)(info->dir_dname + num); info->dir_dlease = (void *)(info->dir_dname_len + num); + if ((unsigned long)(info->dir_dlease + num) > + (unsigned long)info->dir_in + info->dir_buf_size) { + pr_err("dir contents are larger than expected\n"); + WARN_ON(1); + goto bad; + } + info->dir_nr = num; while (num) { /* dentry */ ceph_decode_need(p, end, sizeof(u32)*2, bad); @@ -216,7 +215,7 @@ out_bad: */ static int parse_reply_info_filelock(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; @@ -237,7 +236,7 @@ bad: */ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { if (*p == end) { @@ -261,7 +260,7 @@ bad: */ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); @@ -279,7 +278,7 @@ static int parse_reply_info_extra(void **p, void *end, */ static int parse_reply_info(struct ceph_msg *msg, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { void *p, *end; u32 len; @@ -326,7 +325,9 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { - kfree(info->dir_in); + if (!info->dir_in) + return; + free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); } @@ -443,6 +444,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); s->s_num_cap_releases = 0; + s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); INIT_LIST_HEAD(&s->s_cap_releases_done); @@ -510,12 +512,11 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); + destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_reply) { + if (req->r_reply) ceph_msg_put(req->r_reply); - destroy_reply_info(&req->r_reply_info); - } if (req->r_inode) { ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); @@ -526,7 +527,9 @@ void ceph_mdsc_release_request(struct kref *kref) iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); - if (req->r_old_dentry) { + if (req->r_old_dentry) + dput(req->r_old_dentry); + if (req->r_old_dentry_dir) { /* * track (and drop pins for) r_old_dentry_dir * separately, since r_old_dentry's d_parent may have @@ -535,7 +538,6 @@ void ceph_mdsc_release_request(struct kref *kref) */ ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); - dput(req->r_old_dentry); iput(req->r_old_dentry_dir); } kfree(req->r_path1); @@ -642,6 +644,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc, req->r_unsafe_dir = NULL; } + complete_all(&req->r_safe_completion); + ceph_mdsc_put_request(req); } @@ -709,14 +713,15 @@ static int __choose_mds(struct ceph_mds_client *mdsc, struct dentry *dn = get_nonsnap_parent(parent); inode = dn->d_inode; dout("__choose_mds using nonsnap parent %p\n", inode); - } else if (req->r_dentry->d_inode) { + } else { /* dentry target */ inode = req->r_dentry->d_inode; - } else { - /* dir + name */ - inode = dir; - hash = ceph_dentry_hash(dir, req->r_dentry); - is_hash = true; + if (!inode || mode == USE_AUTH_MDS) { + /* dir + name */ + inode = dir; + hash = ceph_dentry_hash(dir, req->r_dentry); + is_hash = true; + } } } @@ -842,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc, * * called under mdsc->mutex */ +static struct ceph_mds_session * +__open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + session = __ceph_lookup_mds_session(mdsc, target); + if (!session) { + session = register_session(mdsc, target); + if (IS_ERR(session)) + return session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + + return session; +} + +struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + dout("open_export_target_session to mds%d\n", target); + + mutex_lock(&mdsc->mutex); + session = __open_export_target_session(mdsc, target); + mutex_unlock(&mdsc->mutex); + + return session; +} + static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_info *mi; struct ceph_mds_session *ts; int i, mds = session->s_mds; - int target; if (mds >= mdsc->mdsmap->m_max_mds) return; + mi = &mdsc->mdsmap->m_info[mds]; dout("open_export_target_sessions for mds%d (%d targets)\n", session->s_mds, mi->num_export_targets); for (i = 0; i < mi->num_export_targets; i++) { - target = mi->export_targets[i]; - ts = __ceph_lookup_mds_session(mdsc, target); - if (!ts) { - ts = register_session(mdsc, target); - if (IS_ERR(ts)) - return; - } - if (session->s_state == CEPH_MDS_SESSION_NEW || - session->s_state == CEPH_MDS_SESSION_CLOSING) - __open_session(mdsc, session); - else - dout(" mds%d target mds%d %p is %s\n", session->s_mds, - i, ts, session_state_name(ts->s_state)); - ceph_put_mds_session(ts); + ts = __open_export_target_session(mdsc, mi->export_targets[i]); + if (!IS_ERR(ts)) + ceph_put_mds_session(ts); } } @@ -986,7 +1012,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, false); if (!__ceph_is_any_real_caps(ci)) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; @@ -1132,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, return 0; } +static int send_flushmsg_ack(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, u64 seq) +{ + struct ceph_msg *msg; + + dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", + session->s_mds, session_state_name(session->s_state), seq); + msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + + /* * Note new cap ttl, and any transition from stale -> not stale (fresh?). * @@ -1210,7 +1251,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_mds_session *session = arg; struct ceph_inode_info *ci = ceph_inode(inode); - int used, oissued, mine; + int used, wanted, oissued, mine; if (session->s_trim_caps <= 0) return -1; @@ -1218,22 +1259,25 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) spin_lock(&ci->i_ceph_lock); mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); + wanted = __ceph_caps_file_wanted(ci); oissued = __ceph_caps_issued_other(ci, cap); - dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), - ceph_cap_string(used)); - if (ci->i_dirty_caps) - goto out; /* dirty caps */ - if ((used & ~oissued) & mine) + ceph_cap_string(used), ceph_cap_string(wanted)); + if (cap == ci->i_auth_cap) { + if (ci->i_dirty_caps | ci->i_flushing_caps) + goto out; + if ((used | wanted) & CEPH_CAP_ANY_WR) + goto out; + } + if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ - __queue_cap_release(session, ceph_ino(inode), cap->cap_id, - cap->mseq, cap->issue_seq); - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, true); } else { /* try to drop referring dentries */ spin_unlock(&ci->i_ceph_lock); @@ -1267,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc, trim_caps - session->s_trim_caps); session->s_trim_caps = 0; } + + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); return 0; } @@ -1416,17 +1463,19 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, unsigned num; dout("discard_cap_releases mds%d\n", session->s_mds); - spin_lock(&session->s_cap_lock); - /* zero out the in-progress message */ - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - session->s_num_cap_releases += num; + if (!list_empty(&session->s_cap_releases)) { + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", + session->s_mds, msg, num); + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + session->s_num_cap_releases += num; + } /* requeue completed messages */ while (!list_empty(&session->s_cap_releases_done)) { @@ -1443,14 +1492,49 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, msg->front.iov_len = sizeof(*head); list_add(&msg->list_head, &session->s_cap_releases); } - - spin_unlock(&session->s_cap_lock); } /* * requests */ +int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; + size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + + sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + int order, num_entries; + + spin_lock(&ci->i_ceph_lock); + num_entries = ci->i_files + ci->i_subdirs; + spin_unlock(&ci->i_ceph_lock); + num_entries = max(num_entries, 1); + num_entries = min(num_entries, opt->max_readdir); + + order = get_order(size * num_entries); + while (order >= 0) { + rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, + order); + if (rinfo->dir_in) + break; + order--; + } + if (!rinfo->dir_in) + return -ENOMEM; + + num_entries = (PAGE_SIZE << order) / size; + num_entries = min(num_entries, opt->max_readdir); + + rinfo->dir_buf_size = PAGE_SIZE << order; + req->r_num_caps = num_entries + 1; + req->r_args.readdir.max_entries = cpu_to_le32(num_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); + return 0; +} + /* * Create an mds request. */ @@ -1474,6 +1558,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); + req->r_stamp = CURRENT_TIME; + req->r_op = op; req->r_direct_mode = mode; return req; @@ -1699,7 +1785,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, } len = sizeof(*head) + - pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + + sizeof(struct timespec); /* calculate (max) length for cap releases */ len += sizeof(struct ceph_mds_request_release) * @@ -1716,6 +1803,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free2; } + msg->hdr.version = 2; msg->hdr.tid = cpu_to_le64(req->r_tid); head = msg->front.iov_base; @@ -1752,6 +1840,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); head->num_releases = cpu_to_le16(releases); + /* time stamp */ + ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -1875,8 +1966,11 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = -EAGAIN; - if (req->r_err || req->r_got_result) + if (req->r_err || req->r_got_result) { + if (req->r_aborted) + __unregister_request(mdsc, req); goto out; + } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { @@ -2009,7 +2103,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); if (req->r_locked_dir) ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); - if (req->r_old_dentry) + if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); @@ -2131,13 +2225,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* dup? */ if ((req->r_got_unsafe && !head->safe) || (req->r_got_safe && head->safe)) { - pr_warning("got a dup %s reply on %llu from mds%d\n", + pr_warn("got a dup %s reply on %llu from mds%d\n", head->safe ? "safe" : "unsafe", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } if (req->r_got_safe && !head->safe) { - pr_warning("got unsafe after safe on %llu from mds%d\n", + pr_warn("got unsafe after safe on %llu from mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); goto out; @@ -2154,26 +2248,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) */ if (result == -ESTALE) { dout("got ESTALE on request %llu", req->r_tid); - if (!req->r_inode) { - /* do nothing; not an authority problem */ - } else if (req->r_direct_mode != USE_AUTH_MDS) { + if (req->r_direct_mode != USE_AUTH_MDS) { dout("not using auth, setting for that now"); req->r_direct_mode = USE_AUTH_MDS; __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; } else { - struct ceph_inode_info *ci = ceph_inode(req->r_inode); - struct ceph_cap *cap = NULL; - - if (req->r_session) - cap = ceph_get_cap_for_mds(ci, - req->r_session->s_mds); - - dout("already using auth"); - if ((!cap || cap != ci->i_auth_cap) || - (cap->mseq != req->r_sent_on_mseq)) { - dout("but cap changed, so resending"); + int mds = __choose_mds(mdsc, req); + if (mds >= 0 && mds != req->r_session->s_mds) { + dout("but auth changed, so resending"); __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; @@ -2186,7 +2270,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); - complete_all(&req->r_safe_completion); if (req->r_got_unsafe) { /* @@ -2238,8 +2321,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || - req->r_op == CEPH_MDS_OP_LSSNAP) && - rinfo->dir_nr) + req->r_op == CEPH_MDS_OP_LSSNAP)) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } @@ -2400,6 +2482,10 @@ static void handle_session(struct ceph_mds_session *session, trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); break; + case CEPH_SESSION_FLUSHMSG: + send_flushmsg_ack(mdsc, session, seq); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); @@ -2490,6 +2576,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ cap->mseq = 0; /* and migrate_seq */ + cap->cap_gen = cap->session->s_cap_gen; if (recon_state->flock) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); @@ -2552,6 +2639,8 @@ encode_again: } else { err = ceph_pagelist_append(pagelist, &rec, reclen); } + + recon_state->nr_caps++; out_free: kfree(path); out_dput: @@ -2579,6 +2668,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct rb_node *p; int mds = session->s_mds; int err = -ENOMEM; + int s_nr_caps; struct ceph_pagelist *pagelist; struct ceph_reconnect_state recon_state; @@ -2610,20 +2700,38 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, dout("session %p state %s\n", session, session_state_name(session->s_state)); + spin_lock(&session->s_gen_ttl_lock); + session->s_cap_gen++; + spin_unlock(&session->s_gen_ttl_lock); + + spin_lock(&session->s_cap_lock); + /* + * notify __ceph_remove_cap() that we are composing cap reconnect. + * If a cap get released before being added to the cap reconnect, + * __ceph_remove_cap() should skip queuing cap release. + */ + session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ discard_cap_releases(mdsc, session); + spin_unlock(&session->s_cap_lock); /* traverse this session's caps */ - err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); + s_nr_caps = session->s_nr_caps; + err = ceph_pagelist_encode_32(pagelist, s_nr_caps); if (err) goto fail; + recon_state.nr_caps = 0; recon_state.pagelist = pagelist; recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; err = iterate_session_caps(session, encode_caps_cb, &recon_state); if (err < 0) goto fail; + spin_lock(&session->s_cap_lock); + session->s_cap_reconnect = 0; + spin_unlock(&session->s_cap_lock); + /* * snaprealms. we provide mds with the ino, seq (version), and * parent for all of our realms. If the mds has any newer info, @@ -2646,11 +2754,18 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, if (recon_state.flock) reply->hdr.version = cpu_to_le16(2); - if (pagelist->length) { - /* set up outbound data if we have any */ - reply->hdr.data_len = cpu_to_le32(pagelist->length); - ceph_msg_data_add_pagelist(reply, pagelist); + + /* raced with cap release? */ + if (s_nr_caps != recon_state.nr_caps) { + struct page *page = list_first_entry(&pagelist->head, + struct page, lru); + __le32 *addr = kmap_atomic(page); + *addr = cpu_to_le32(recon_state.nr_caps); + kunmap_atomic(addr); } + + reply->hdr.data_len = cpu_to_le32(pagelist->length); + ceph_msg_data_add_pagelist(reply, pagelist); ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); @@ -3417,7 +3532,7 @@ static void peer_reset(struct ceph_connection *con) struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - pr_warning("mds%d closed our session\n", s->s_mds); + pr_warn("mds%d closed our session\n", s->s_mds); send_mds_reconnect(mdsc, s); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index c2a19fbbe51..e00737cf523 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -67,6 +67,7 @@ struct ceph_mds_reply_info_parsed { /* for readdir results */ struct { struct ceph_mds_reply_dirfrag *dir_dir; + size_t dir_buf_size; int dir_nr; char **dir_dname; u32 *dir_dname_len; @@ -132,6 +133,7 @@ struct ceph_mds_session { struct list_head s_caps; /* all caps issued by this session */ int s_nr_caps, s_trim_caps; int s_num_cap_releases; + int s_cap_reconnect; struct list_head s_cap_releases; /* waiting cap_release messages */ struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; @@ -192,6 +194,7 @@ struct ceph_mds_request { int r_fmode; /* file mode, if expecting cap */ kuid_t r_uid; kgid_t r_gid; + struct timespec r_stamp; /* for choosing which mds to send this request to */ int r_direct_mode; @@ -345,7 +348,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct dentry *dn); extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); - +extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir); extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, @@ -382,6 +386,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg); +extern struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 132b64eeecd..261531e55e9 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -62,7 +62,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_decode_16_safe(p, end, version, bad); if (version > 3) { - pr_warning("got mdsmap version %d > 3, failing", version); + pr_warn("got mdsmap version %d > 3, failing", version); goto bad; } diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 89fa4a940a0..51cc23e4811 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -41,6 +41,8 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_RENEWCAPS: return "renewcaps"; case CEPH_SESSION_STALE: return "stale"; case CEPH_SESSION_RECALL_STATE: return "recall_state"; + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; } return "???"; } @@ -52,6 +54,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; + case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; case CEPH_MDS_OP_GETATTR: return "getattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6a0951e4304..06150fd745a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -144,7 +144,11 @@ enum { Opt_ino32, Opt_noino32, Opt_fscache, - Opt_nofscache + Opt_nofscache, +#ifdef CONFIG_CEPH_FS_POSIX_ACL + Opt_acl, +#endif + Opt_noacl }; static match_table_t fsopt_tokens = { @@ -172,6 +176,10 @@ static match_table_t fsopt_tokens = { {Opt_noino32, "noino32"}, {Opt_fscache, "fsc"}, {Opt_nofscache, "nofsc"}, +#ifdef CONFIG_CEPH_FS_POSIX_ACL + {Opt_acl, "acl"}, +#endif + {Opt_noacl, "noacl"}, {-1, NULL} }; @@ -271,6 +279,14 @@ static int parse_fsopt_token(char *c, void *private) case Opt_nofscache: fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; break; +#ifdef CONFIG_CEPH_FS_POSIX_ACL + case Opt_acl: + fsopt->sb_flags |= MS_POSIXACL; + break; +#endif + case Opt_noacl: + fsopt->sb_flags &= ~MS_POSIXACL; + break; default: BUG_ON(token); } @@ -438,6 +454,13 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) else seq_puts(m, ",nofsc"); +#ifdef CONFIG_CEPH_FS_POSIX_ACL + if (fsopt->sb_flags & MS_POSIXACL) + seq_puts(m, ",acl"); + else + seq_puts(m, ",noacl"); +#endif + if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) @@ -490,10 +513,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const unsigned supported_features = + const u64 supported_features = CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH; - const unsigned required_features = 0; + const u64 required_features = 0; int page_count; size_t size; int err = -ENOMEM; @@ -686,6 +709,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, + .drop_inode = ceph_drop_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, @@ -819,6 +843,7 @@ static int ceph_set_super(struct super_block *s, void *data) s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ + s->s_xattr = ceph_xattr_handlers; s->s_fs_info = fsc; fsc->sb = s; @@ -906,6 +931,10 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, struct ceph_options *opt = NULL; dout("ceph_mount\n"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + flags |= MS_POSIXACL; +#endif err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); if (err < 0) { res = ERR_PTR(err); @@ -997,6 +1026,7 @@ static int __init init_ceph(void) if (ret) goto out; + ceph_flock_init(); ceph_xattr_init(); ret = register_filesystem(&ceph_fs_type); if (ret) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6014b0a3c40..12b20744e38 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -13,6 +13,7 @@ #include <linux/wait.h> #include <linux/writeback.h> #include <linux/slab.h> +#include <linux/posix_acl.h> #include <linux/ceph/libceph.h> @@ -265,7 +266,6 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with complete dir */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; @@ -287,9 +287,6 @@ struct ceph_inode_info { unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ - int i_cap_exporting_mds; /* to handle cap migration between */ - unsigned i_cap_exporting_mseq; /* mds's. */ - unsigned i_cap_exporting_issued; struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or @@ -335,7 +332,6 @@ struct ceph_inode_info { u32 i_fscache_gen; /* sequence, for delayed fscache validate */ struct work_struct i_revalidate_work; #endif - struct inode vfs_inode; /* at end */ }; @@ -529,6 +525,8 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) } extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask); extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); @@ -577,7 +575,7 @@ struct ceph_file_info { /* readdir: position within a frag */ unsigned offset; /* offset of last chunk, adjusted for . and .. */ - u64 next_offset; /* offset of next chunk (last_name's + 1) */ + unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ struct dentry *dentry; /* next dentry (for dcache readdir) */ int dir_release_count; @@ -691,6 +689,7 @@ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); +extern int ceph_drop_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); @@ -724,6 +723,9 @@ extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, /* xattr.c */ extern int ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); +int __ceph_removexattr(struct dentry *, const char *); extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern int ceph_removexattr(struct dentry *, const char *); @@ -732,24 +734,57 @@ extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); extern void __init ceph_xattr_init(void); extern void ceph_xattr_exit(void); +/* acl.c */ +extern const struct xattr_handler *ceph_xattr_handlers[]; + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + +struct posix_acl *ceph_get_acl(struct inode *, int); +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ceph_init_acl(struct dentry *, struct inode *, struct inode *); + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ + forget_all_cached_acls(inode); +} + +#else + +#define ceph_get_acl NULL +#define ceph_set_acl NULL + +static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + return 0; +} + +static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) +{ + return 0; +} + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ +} + +#endif + /* caps.c */ extern const char *ceph_cap_string(int c); extern void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg); -extern int ceph_add_cap(struct inode *inode, - struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, - unsigned cap, unsigned seq, u64 realmino, int flags, - struct ceph_cap_reservation *caps_reservation); -extern void __ceph_remove_cap(struct ceph_cap *cap); -static inline void ceph_remove_cap(struct ceph_cap *cap) -{ - spin_lock(&cap->ci->i_ceph_lock); - __ceph_remove_cap(cap); - spin_unlock(&cap->ci->i_ceph_lock); -} +extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned cap, unsigned seq, u64 realmino, int flags, + struct ceph_cap **new_cap); +extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); +extern int ceph_is_any_caps(struct inode *inode); extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, u64 cap_id, u32 migrate_seq, u32 issue_seq); @@ -836,6 +871,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); extern const struct export_operations ceph_export_ops; /* locks.c */ +extern __init void ceph_flock_init(void); extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index be661d8f532..c9c2b887381 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -6,16 +6,33 @@ #include <linux/ceph/decode.h> #include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> #include <linux/slab.h> #define XATTR_CEPH_PREFIX "ceph." #define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr); + +/* + * List of handlers for synthetic system.* attributes. Other + * attributes are handled directly. + */ +const struct xattr_handler *ceph_xattr_handlers[] = { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + static bool ceph_is_valid_xattr(const char *name) { return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -47,32 +64,48 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) } static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, - size_t size) + size_t size) { int ret; struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; s64 pool = ceph_file_layout_pg_pool(ci->i_layout); const char *pool_name; + char buf[128]; dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); down_read(&osdc->map_sem); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); - if (pool_name) - ret = snprintf(val, size, - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", + if (pool_name) { + size_t len = strlen(pool_name); + ret = snprintf(buf, sizeof(buf), + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=", (unsigned long long)ceph_file_layout_su(ci->i_layout), (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout), - pool_name); - else - ret = snprintf(val, size, + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + if (!size) { + ret += len; + } else if (ret + len > size) { + ret = -ERANGE; + } else { + memcpy(val, buf, ret); + memcpy(val + ret, pool_name, len); + ret += len; + } + } else { + ret = snprintf(buf, sizeof(buf), "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", (unsigned long long)ceph_file_layout_su(ci->i_layout), (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), (unsigned long long)ceph_file_layout_object_size(ci->i_layout), (unsigned long long)pool); - + if (size) { + if (ret <= size) + memcpy(val, buf, ret); + else + ret = -ERANGE; + } + } up_read(&osdc->map_sem); return ret; } @@ -198,7 +231,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { .name_size = sizeof("ceph.dir.layout"), .getxattr_cb = ceph_vxattrcb_layout, .readonly = false, - .hidden = false, + .hidden = true, .exists_cb = ceph_vxattrcb_layout_exists, }, XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), @@ -225,7 +258,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { .name_size = sizeof("ceph.file.layout"), .getxattr_cb = ceph_vxattrcb_layout, .readonly = false, - .hidden = false, + .hidden = true, .exists_cb = ceph_vxattrcb_layout_exists, }, XATTR_LAYOUT_FIELD(file, layout, stripe_unit), @@ -305,8 +338,7 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, static int __set_xattr(struct ceph_inode_info *ci, const char *name, int name_len, const char *val, int val_len, - int dirty, - int should_free_name, int should_free_val, + int flags, int update_xattr, struct ceph_inode_xattr **newxattr) { struct rb_node **p; @@ -335,12 +367,31 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr = NULL; } + if (update_xattr) { + int err = 0; + if (xattr && (flags & XATTR_CREATE)) + err = -EEXIST; + else if (!xattr && (flags & XATTR_REPLACE)) + err = -ENODATA; + if (err) { + kfree(name); + kfree(val); + return err; + } + if (update_xattr < 0) { + if (xattr) + __remove_xattr(ci, xattr); + kfree(name); + return 0; + } + } + if (!xattr) { new = 1; xattr = *newxattr; xattr->name = name; xattr->name_len = name_len; - xattr->should_free_name = should_free_name; + xattr->should_free_name = update_xattr; ci->i_xattrs.count++; dout("__set_xattr count=%d\n", ci->i_xattrs.count); @@ -350,7 +401,7 @@ static int __set_xattr(struct ceph_inode_info *ci, if (xattr->should_free_val) kfree((void *)xattr->val); - if (should_free_name) { + if (update_xattr) { kfree((void *)name); name = xattr->name; } @@ -365,8 +416,8 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr->val = ""; xattr->val_len = val_len; - xattr->dirty = dirty; - xattr->should_free_val = (val && should_free_val); + xattr->dirty = update_xattr; + xattr->should_free_val = (val && update_xattr); if (new) { rb_link_node(&xattr->node, parent, p); @@ -428,7 +479,7 @@ static int __remove_xattr(struct ceph_inode_info *ci, struct ceph_inode_xattr *xattr) { if (!xattr) - return -EOPNOTSUPP; + return -ENODATA; rb_erase(&xattr->node, &ci->i_xattrs.index); @@ -574,7 +625,7 @@ start: p += len; err = __set_xattr(ci, name, namelen, val, len, - 0, 0, 0, &xattrs[numattr]); + 0, 0, &xattrs[numattr]); if (err < 0) goto bad; @@ -663,10 +714,9 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) } } -ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, +ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); int err; struct ceph_inode_xattr *xattr; @@ -675,7 +725,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!ceph_is_valid_xattr(name)) return -ENODATA; - /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { @@ -725,6 +774,15 @@ out: return err; } +ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, value, size); + + return __ceph_getxattr(dentry->d_inode, name, value, size); +} + ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = dentry->d_inode; @@ -800,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; int err; @@ -829,6 +886,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, dout("setxattr value=%.*s\n", (int)size, value); + if (!value) + flags |= CEPH_XATTR_REMOVE; + /* do request */ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, USE_AUTH_MDS); @@ -848,9 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, req->r_data_len = size; dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); @@ -863,15 +921,15 @@ out: return err; } -int ceph_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int __ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); int issued; int err; - int dirty; + int dirty = 0; int name_len = strlen(name); int val_len = size; char *newname = NULL; @@ -879,9 +937,6 @@ int ceph_setxattr(struct dentry *dentry, const char *name, struct ceph_inode_xattr *xattr = NULL; int required_blob_size; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -935,12 +990,14 @@ retry: goto retry; } - err = __set_xattr(ci, newname, name_len, newval, - val_len, 1, 1, 1, &xattr); + err = __set_xattr(ci, newname, name_len, newval, val_len, + flags, value ? 1 : -1, &xattr); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); - ci->i_xattrs.dirty = true; - inode->i_ctime = CURRENT_TIME; + if (!err) { + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + } spin_unlock(&ci->i_ceph_lock); if (dirty) @@ -958,12 +1015,23 @@ out: return err; } +int ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + return __ceph_setxattr(dentry, name, value, size, flags); +} + static int ceph_send_removexattr(struct dentry *dentry, const char *name) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; - struct inode *parent_inode; struct ceph_mds_request *req; int err; @@ -977,14 +1045,12 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; } -int ceph_removexattr(struct dentry *dentry, const char *name) +int __ceph_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct ceph_vxattr *vxattr; @@ -994,9 +1060,6 @@ int ceph_removexattr(struct dentry *dentry, const char *name) int required_blob_size; int dirty; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -1053,3 +1116,13 @@ out: return err; } +int ceph_removexattr(struct dentry *dentry, const char *name) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + return __ceph_removexattr(dentry, name); +} diff --git a/fs/char_dev.c b/fs/char_dev.c index 94b5f60076d..f77f7702fab 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -576,7 +576,8 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data) void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); - bdi_init(&directly_mappable_cdev_bdi); + if (bdi_init(&directly_mappable_cdev_bdi)) + panic("Failed to init directly mappable cdev bdi"); } diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 37e4a72a7d1..9409fa10bd5 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -65,5 +65,6 @@ struct cifs_sb_info { char *mountdata; /* options received at mount time or via DFS refs */ struct backing_dev_info bdi; struct delayed_work prune_tlinks; + struct rcu_head rcu; }; #endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 0227b45ef00..15e9505aa35 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -290,7 +290,8 @@ int cifsConvertToUTF16(__le16 *target, const char *source, int srclen, const struct nls_table *cp, int mapChars) { - int i, j, charlen; + int i, charlen; + int j = 0; char src_char; __le16 dst_char; wchar_t tmp; @@ -298,12 +299,11 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, if (!mapChars) return cifs_strtoUTF16(target, source, PATH_MAX, cp); - for (i = 0, j = 0; i < srclen; j++) { + for (i = 0; i < srclen; j++) { src_char = source[i]; charlen = 1; switch (src_char) { case 0: - put_unaligned(0, &target[j]); goto ctoUTF16_out; case ':': dst_char = cpu_to_le16(UNI_COLON); @@ -350,6 +350,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, } ctoUTF16_out: + put_unaligned(0, &target[j]); /* Null terminate target unicode string */ return j; } diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 51f5e0ee723..7ff866dbb89 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -865,8 +865,8 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, return rc; } -static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, - __u16 fid, u32 *pacllen) +struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, + const struct cifs_fid *cifsfid, u32 *pacllen) { struct cifs_ntsd *pntsd = NULL; unsigned int xid; @@ -877,7 +877,8 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, return ERR_CAST(tlink); xid = get_xid(); - rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen); + rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd, + pacllen); free_xid(xid); cifs_put_tlink(tlink); @@ -895,9 +896,10 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, int oplock = 0; unsigned int xid; int rc, create_options = 0; - __u16 fid; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_fid fid; + struct cifs_open_parms oparms; if (IS_ERR(tlink)) return ERR_CAST(tlink); @@ -908,12 +910,19 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, - create_options, &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = READ_CONTROL; + oparms.create_options = create_options; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (!rc) { - rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); - CIFSSMBClose(xid, tcon, fid); + rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen); + CIFSSMBClose(xid, tcon, fid.netfid); } cifs_put_tlink(tlink); @@ -938,7 +947,7 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, if (!open_file) return get_cifs_acl_by_path(cifs_sb, path, pacllen); - pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->fid.netfid, pacllen); + pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen); cifsFileInfo_put(open_file); return pntsd; } @@ -950,10 +959,11 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, int oplock = 0; unsigned int xid; int rc, access_flags, create_options = 0; - __u16 fid; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_fid fid; + struct cifs_open_parms oparms; if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -969,18 +979,25 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, else access_flags = WRITE_DAC; - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags, - create_options, &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = access_flags; + oparms.create_options = create_options; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) { cifs_dbg(VFS, "Unable to open file to set ACL\n"); goto out; } - rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag); + rc = CIFSSMBSetCIFSACL(xid, tcon, fid.netfid, pnntsd, acllen, aclflag); cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc); - CIFSSMBClose(xid, tcon, fid); + CIFSSMBClose(xid, tcon, fid.netfid); out: free_xid(xid); cifs_put_tlink(tlink); @@ -990,19 +1007,31 @@ out: /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, - struct inode *inode, const char *path, const __u16 *pfid) + struct inode *inode, const char *path, + const struct cifs_fid *pfid) { struct cifs_ntsd *pntsd = NULL; u32 acllen = 0; int rc = 0; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_tcon *tcon; cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); - if (pfid) - pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); - else - pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + if (pfid && (tcon->ses->server->ops->get_acl_by_fid)) + pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid, + &acllen); + else if (tcon->ses->server->ops->get_acl) + pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, + &acllen); + else { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); @@ -1014,6 +1043,8 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc); } + cifs_put_tlink(tlink); + return rc; } @@ -1027,15 +1058,30 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, __u32 secdesclen = 0; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_tcon *tcon; + + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); cifs_dbg(NOISY, "set ACL from mode for %s\n", path); /* Get the security descriptor */ - pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); + + if (tcon->ses->server->ops->get_acl == NULL) { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } + + pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, + &secdesclen); if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); - goto out; + cifs_put_tlink(tlink); + return rc; } /* @@ -1048,6 +1094,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, pnntsd = kmalloc(secdesclen, GFP_KERNEL); if (!pnntsd) { kfree(pntsd); + cifs_put_tlink(tlink); return -ENOMEM; } @@ -1056,14 +1103,18 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); + if (tcon->ses->server->ops->set_acl == NULL) + rc = -EOPNOTSUPP; + if (!rc) { /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag); + rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode, + path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } + cifs_put_tlink(tlink); kfree(pnntsd); kfree(pntsd); -out: return rc; } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index fc6f4f3a1a9..4934347321d 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -548,7 +548,13 @@ static int CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) { int rc; - unsigned int offset = CIFS_SESS_KEY_SIZE + 8; + struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *) + (ses->auth_key.response + CIFS_SESS_KEY_SIZE); + unsigned int hash_len; + + /* The MD5 hash starts at challenge_key.key */ + hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE + + offsetof(struct ntlmv2_resp, challenge.key[0])); if (!ses->server->secmech.sdeschmacmd5) { cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); @@ -556,7 +562,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) } rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, - ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", __func__); @@ -570,20 +576,21 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) } if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) - memcpy(ses->auth_key.response + offset, - ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + memcpy(ntlmv2->challenge.key, + ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); else - memcpy(ses->auth_key.response + offset, - ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + memcpy(ntlmv2->challenge.key, + ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, - ses->auth_key.response + offset, ses->auth_key.len - offset); + ntlmv2->challenge.key, hash_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } + /* Note that the MD5 digest over writes anon.challenge_key.key */ rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, - ses->auth_key.response + CIFS_SESS_KEY_SIZE); + ntlmv2->ntlmv2_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -627,7 +634,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) int rc; int baselen; unsigned int tilen; - struct ntlmv2_resp *buf; + struct ntlmv2_resp *ntlmv2; char ntlmv2_hash[16]; unsigned char *tiblob = NULL; /* target info blob */ @@ -660,13 +667,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } ses->auth_key.len += baselen; - buf = (struct ntlmv2_resp *) + ntlmv2 = (struct ntlmv2_resp *) (ses->auth_key.response + CIFS_SESS_KEY_SIZE); - buf->blob_signature = cpu_to_le32(0x00000101); - buf->reserved = 0; - buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); - get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); - buf->reserved2 = 0; + ntlmv2->blob_signature = cpu_to_le32(0x00000101); + ntlmv2->reserved = 0; + /* Must be within 5 minutes of the server */ + ntlmv2->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal)); + ntlmv2->reserved2 = 0; memcpy(ses->auth_key.response + baselen, tiblob, tilen); @@ -706,7 +714,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, - ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 801975c34cf..88839806742 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -87,10 +87,6 @@ extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; -#ifdef CONFIG_CIFS_SMB2 -__u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; -#endif - /* * Bumps refcount for cifs super block. * Note that it should be only called if a referece to VFS super block is @@ -120,14 +116,16 @@ cifs_read_super(struct super_block *sb) { struct inode *inode; struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; int rc = 0; cifs_sb = CIFS_SB(sb); + tcon = cifs_sb_master_tcon(cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) sb->s_flags |= MS_POSIXACL; - if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES) + if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files) sb->s_maxbytes = MAX_LFS_FILESIZE; else sb->s_maxbytes = MAX_NON_LFS; @@ -147,7 +145,7 @@ cifs_read_super(struct super_block *sb) goto out_no_root; } - if (cifs_sb_master_tcon(cifs_sb)->nocase) + if (tcon->nocase) sb->s_d_op = &cifs_ci_dentry_ops; else sb->s_d_op = &cifs_dentry_ops; @@ -249,8 +247,9 @@ cifs_alloc_inode(struct super_block *sb) * server, can not assume caching of file data or metadata. */ cifs_set_oplock_level(cifs_inode, 0); - cifs_inode->delete_pending = false; - cifs_inode->invalid_mapping = false; + cifs_inode->flags = 0; + spin_lock_init(&cifs_inode->writers_lock); + cifs_inode->writers = 0; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; @@ -284,7 +283,7 @@ cifs_destroy_inode(struct inode *inode) static void cifs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); cifs_fscache_release_inode_cookie(inode); } @@ -295,7 +294,7 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; - seq_printf(s, ",addr="); + seq_puts(s, ",addr="); switch (server->dstaddr.ss_family) { case AF_INET: @@ -307,7 +306,7 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) seq_printf(s, "%%%u", sa6->sin6_scope_id); break; default: - seq_printf(s, "(unknown)"); + seq_puts(s, "(unknown)"); } } @@ -317,45 +316,45 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses) if (ses->sectype == Unspecified) return; - seq_printf(s, ",sec="); + seq_puts(s, ",sec="); switch (ses->sectype) { case LANMAN: - seq_printf(s, "lanman"); + seq_puts(s, "lanman"); break; case NTLMv2: - seq_printf(s, "ntlmv2"); + seq_puts(s, "ntlmv2"); break; case NTLM: - seq_printf(s, "ntlm"); + seq_puts(s, "ntlm"); break; case Kerberos: - seq_printf(s, "krb5"); + seq_puts(s, "krb5"); break; case RawNTLMSSP: - seq_printf(s, "ntlmssp"); + seq_puts(s, "ntlmssp"); break; default: /* shouldn't ever happen */ - seq_printf(s, "unknown"); + seq_puts(s, "unknown"); break; } if (ses->sign) - seq_printf(s, "i"); + seq_puts(s, "i"); } static void cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb) { - seq_printf(s, ",cache="); + seq_puts(s, ",cache="); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) - seq_printf(s, "strict"); + seq_puts(s, "strict"); else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) - seq_printf(s, "none"); + seq_puts(s, "none"); else - seq_printf(s, "loose"); + seq_puts(s, "loose"); } static void @@ -388,7 +387,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_show_cache_flavor(s, cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) - seq_printf(s, ",multiuser"); + seq_puts(s, ",multiuser"); else if (tcon->ses->user_name) seq_printf(s, ",username=%s", tcon->ses->user_name); @@ -414,16 +413,16 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",uid=%u", from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid)); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) - seq_printf(s, ",forceuid"); + seq_puts(s, ",forceuid"); else - seq_printf(s, ",noforceuid"); + seq_puts(s, ",noforceuid"); seq_printf(s, ",gid=%u", from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid)); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) - seq_printf(s, ",forcegid"); + seq_puts(s, ",forcegid"); else - seq_printf(s, ",noforcegid"); + seq_puts(s, ",noforcegid"); cifs_show_address(s, tcon->ses->server); @@ -435,47 +434,47 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_show_nls(s, cifs_sb->local_nls); if (tcon->seal) - seq_printf(s, ",seal"); + seq_puts(s, ",seal"); if (tcon->nocase) - seq_printf(s, ",nocase"); + seq_puts(s, ",nocase"); if (tcon->retry) - seq_printf(s, ",hard"); + seq_puts(s, ",hard"); if (tcon->unix_ext) - seq_printf(s, ",unix"); + seq_puts(s, ",unix"); else - seq_printf(s, ",nounix"); + seq_puts(s, ",nounix"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) - seq_printf(s, ",posixpaths"); + seq_puts(s, ",posixpaths"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) - seq_printf(s, ",setuids"); + seq_puts(s, ",setuids"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - seq_printf(s, ",serverino"); + seq_puts(s, ",serverino"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) - seq_printf(s, ",rwpidforward"); + seq_puts(s, ",rwpidforward"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) - seq_printf(s, ",forcemand"); + seq_puts(s, ",forcemand"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) - seq_printf(s, ",nouser_xattr"); + seq_puts(s, ",nouser_xattr"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) - seq_printf(s, ",mapchars"); + seq_puts(s, ",mapchars"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) - seq_printf(s, ",sfu"); + seq_puts(s, ",sfu"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) - seq_printf(s, ",nobrl"); + seq_puts(s, ",nobrl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) - seq_printf(s, ",cifsacl"); + seq_puts(s, ",cifsacl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) - seq_printf(s, ",dynperm"); + seq_puts(s, ",dynperm"); if (root->d_sb->s_flags & MS_POSIXACL) - seq_printf(s, ",acl"); + seq_puts(s, ",acl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - seq_printf(s, ",mfsymlinks"); + seq_puts(s, ",mfsymlinks"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) - seq_printf(s, ",fsc"); + seq_puts(s, ",fsc"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) - seq_printf(s, ",nostrictsync"); + seq_puts(s, ",nostrictsync"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) - seq_printf(s, ",noperm"); + seq_puts(s, ",noperm"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) seq_printf(s, ",backupuid=%u", from_kuid_munged(&init_user_ns, @@ -539,6 +538,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int cifs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return 0; } @@ -725,23 +725,42 @@ out_nls: goto out; } -static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t +cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t rc; + struct inode *inode = file_inode(iocb->ki_filp); + + rc = cifs_revalidate_mapping(inode); + if (rc) + return rc; + + return generic_file_read_iter(iocb, iter); +} + +static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); + struct cifsInodeInfo *cinode = CIFS_I(inode); ssize_t written; int rc; - written = generic_file_aio_write(iocb, iov, nr_segs, pos); + written = cifs_get_writer(cinode); + if (written) + return written; + + written = generic_file_write_iter(iocb, from); if (CIFS_CACHE_WRITE(CIFS_I(inode))) - return written; + goto out; rc = filemap_fdatawrite(inode->i_mapping); if (rc) - cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", + cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n", rc, inode); +out: + cifs_put_writer(cinode); return written; } @@ -847,7 +866,6 @@ const struct inode_operations cifs_file_inode_ops = { /* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, .getattr = cifs_getattr, /* do we need this anymore? */ - .rename = cifs_rename, .permission = cifs_permission, #ifdef CONFIG_CIFS_XATTR .setxattr = cifs_setxattr, @@ -874,10 +892,10 @@ const struct inode_operations cifs_symlink_inode_ops = { }; const struct file_operations cifs_file_ops = { - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = cifs_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_loose_read_iter, + .write_iter = cifs_file_write_iter, .open = cifs_open, .release = cifs_close, .lock = cifs_lock, @@ -893,10 +911,10 @@ const struct file_operations cifs_file_ops = { }; const struct file_operations cifs_file_strict_ops = { - .read = do_sync_read, - .write = do_sync_write, - .aio_read = cifs_strict_readv, - .aio_write = cifs_strict_writev, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_strict_readv, + .write_iter = cifs_strict_writev, .open = cifs_open, .release = cifs_close, .lock = cifs_lock, @@ -913,10 +931,10 @@ const struct file_operations cifs_file_strict_ops = { const struct file_operations cifs_file_direct_ops = { /* BB reevaluate whether they can be done with directio, no cache */ - .read = do_sync_read, - .write = do_sync_write, - .aio_read = cifs_user_readv, - .aio_write = cifs_user_writev, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_user_readv, + .write_iter = cifs_user_writev, .open = cifs_open, .release = cifs_close, .lock = cifs_lock, @@ -932,10 +950,10 @@ const struct file_operations cifs_file_direct_ops = { }; const struct file_operations cifs_file_nobrl_ops = { - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = cifs_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_loose_read_iter, + .write_iter = cifs_file_write_iter, .open = cifs_open, .release = cifs_close, .fsync = cifs_fsync, @@ -950,10 +968,10 @@ const struct file_operations cifs_file_nobrl_ops = { }; const struct file_operations cifs_file_strict_nobrl_ops = { - .read = do_sync_read, - .write = do_sync_write, - .aio_read = cifs_strict_readv, - .aio_write = cifs_strict_writev, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_strict_readv, + .write_iter = cifs_strict_writev, .open = cifs_open, .release = cifs_close, .fsync = cifs_strict_fsync, @@ -969,10 +987,10 @@ const struct file_operations cifs_file_strict_nobrl_ops = { const struct file_operations cifs_file_direct_nobrl_ops = { /* BB reevaluate whether they can be done with directio, no cache */ - .read = do_sync_read, - .write = do_sync_write, - .aio_read = cifs_user_readv, - .aio_write = cifs_user_writev, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_user_readv, + .write_iter = cifs_user_writev, .open = cifs_open, .release = cifs_close, .fsync = cifs_fsync, @@ -1003,7 +1021,7 @@ cifs_init_once(void *inode) init_rwsem(&cifsi->lock_sem); } -static int +static int __init cifs_init_inodecache(void) { cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", @@ -1178,10 +1196,6 @@ init_cifs(void) spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); -#ifdef CONFIG_CIFS_SMB2 - get_random_bytes(cifs_client_guid, SMB2_CLIENT_GUID_SIZE); -#endif - if (cifs_max_pending < 2) { cifs_max_pending = 2; cifs_dbg(FYI, "cifs_max_pending set to min of 2\n"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 26a754f49ba..70f178a7c75 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -22,20 +22,28 @@ #ifndef _CIFSFS_H #define _CIFSFS_H +#include <linux/hash.h> + #define ROOT_I 2 /* * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down - * so that it will fit. + * so that it will fit. We use hash_64 to convert the value to 31 bits, and + * then add 1, to ensure that we don't end up with a 0 as the value. */ +#if BITS_PER_LONG == 64 static inline ino_t cifs_uniqueid_to_ino_t(u64 fileid) { - ino_t ino = (ino_t) fileid; - if (sizeof(ino_t) < sizeof(u64)) - ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8; - return ino; + return (ino_t)fileid; } +#else +static inline ino_t +cifs_uniqueid_to_ino_t(u64 fileid) +{ + return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1; +} +#endif extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; @@ -67,6 +75,8 @@ extern int cifs_revalidate_dentry_attr(struct dentry *); extern int cifs_revalidate_file(struct file *filp); extern int cifs_revalidate_dentry(struct dentry *); extern int cifs_invalidate_mapping(struct inode *inode); +extern int cifs_revalidate_mapping(struct inode *inode); +extern int cifs_zap_mapping(struct inode *inode); extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int cifs_setattr(struct dentry *, struct iattr *); @@ -85,14 +95,10 @@ extern const struct file_operations cifs_file_strict_nobrl_ops; extern int cifs_open(struct inode *inode, struct file *file); extern int cifs_close(struct inode *inode, struct file *file); extern int cifs_closedir(struct inode *inode, struct file *file); -extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); +extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to); +extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); +extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); +extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); @@ -130,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.02" +#define CIFS_VERSION "2.03" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 52b6f6c26bf..de6aed8c78e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -228,6 +228,8 @@ struct smb_version_operations { /* verify the message */ int (*check_message)(char *, unsigned int); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); + void (*downgrade_oplock)(struct TCP_Server_Info *, + struct cifsInodeInfo *, bool); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -261,7 +263,7 @@ struct smb_version_operations { /* query path data from the server */ int (*query_path_info)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *, - FILE_ALL_INFO *, bool *); + FILE_ALL_INFO *, bool *, bool *); /* query file data from the server */ int (*query_file_info)(const unsigned int, struct cifs_tcon *, struct cifs_fid *, FILE_ALL_INFO *); @@ -278,6 +280,8 @@ struct smb_version_operations { /* set attributes */ int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, const unsigned int); + int (*set_compression)(const unsigned int, struct cifs_tcon *, + struct cifsFileInfo *); /* check if we can send an echo or nor */ bool (*can_echo)(struct TCP_Server_Info *); /* send echo request */ @@ -321,7 +325,8 @@ struct smb_version_operations { /* async read from the server */ int (*async_readv)(struct cifs_readdata *); /* async write to the server */ - int (*async_writev)(struct cifs_writedata *); + int (*async_writev)(struct cifs_writedata *, + void (*release)(struct kref *)); /* sync read from the server */ int (*sync_read)(const unsigned int, struct cifsFileInfo *, struct cifs_io_parms *, unsigned int *, char **, @@ -368,8 +373,12 @@ struct smb_version_operations { void (*new_lease_key)(struct cifs_fid *); int (*generate_signingkey)(struct cifs_ses *); int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); - int (*query_mf_symlink)(const unsigned char *, char *, unsigned int *, - struct cifs_sb_info *, unsigned int); + int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, + struct cifs_sb_info *, const unsigned char *, + char *, unsigned int *); + int (*create_mf_symlink)(unsigned int, struct cifs_tcon *, + struct cifs_sb_info *, const unsigned char *, + char *, unsigned int *); /* if we can do cache read operations */ bool (*is_read_op)(__u32); /* set oplock level for the inode */ @@ -379,6 +388,22 @@ struct smb_version_operations { char * (*create_lease_buf)(u8 *, u8); /* parse lease context buffer and return oplock/epoch info */ __u8 (*parse_lease_buf)(void *, unsigned int *); + int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file, + struct cifsFileInfo *target_file, u64 src_off, u64 len, + u64 dest_off); + int (*validate_negotiate)(const unsigned int, struct cifs_tcon *); + ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, + const unsigned char *, const unsigned char *, char *, + size_t, const struct nls_table *, int); + int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, + const char *, const void *, const __u16, + const struct nls_table *, int); + struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *, + const char *, u32 *); + struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *, + const struct cifs_fid *, u32 *); + int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, + int); }; struct smb_version_values { @@ -490,7 +515,7 @@ struct cifs_mnt_data { static inline unsigned int get_rfc1002_length(void *buf) { - return be32_to_cpu(*((__be32 *)buf)); + return be32_to_cpu(*((__be32 *)buf)) & 0xffffff; } static inline void @@ -534,6 +559,7 @@ struct TCP_Server_Info { int echo_credits; /* echo reserved slots */ int oplock_credits; /* oplock break reserved slots */ bool echoes:1; /* enable echoes */ + __u8 client_guid[SMB2_CLIENT_GUID_SIZE]; /* Client GUID */ #endif u16 dialect; /* dialect index that server chose */ bool oplocks:1; /* enable oplocks */ @@ -620,11 +646,34 @@ set_credits(struct TCP_Server_Info *server, const int val) } static inline __u64 -get_next_mid(struct TCP_Server_Info *server) +get_next_mid64(struct TCP_Server_Info *server) { return server->ops->get_next_mid(server); } +static inline __le16 +get_next_mid(struct TCP_Server_Info *server) +{ + __u16 mid = get_next_mid64(server); + /* + * The value in the SMB header should be little endian for easy + * on-the-wire decoding. + */ + return cpu_to_le16(mid); +} + +static inline __u16 +get_mid(const struct smb_hdr *smb) +{ + return le16_to_cpu(smb->Mid); +} + +static inline bool +compare_mid(__u16 mid, const struct smb_hdr *smb) +{ + return mid == le16_to_cpu(smb->Mid); +} + /* * When the server supports very large reads and writes via POSIX extensions, * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not @@ -828,6 +877,11 @@ struct cifs_tcon { __u32 maximal_access; __u32 vol_serial_number; __le64 vol_create_time; + __u32 ss_flags; /* sector size flags */ + __u32 perf_sector_size; /* best sector size for perf */ + __u32 max_chunks; + __u32 max_bytes_chunk; + __u32 max_bytes_copy; #endif /* CONFIG_CIFS_SMB2 */ #ifdef CONFIG_CIFS_FSCACHE u64 resource_id; /* server resource id */ @@ -1020,7 +1074,7 @@ struct cifs_writedata { unsigned int pagesz; unsigned int tailsz; unsigned int nr_pages; - struct page *pages[1]; + struct page *pages[]; }; /* @@ -1060,8 +1114,15 @@ struct cifsInodeInfo { __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ unsigned int oplock; /* oplock/lease level we have */ unsigned int epoch; /* used to track lease state changes */ - bool delete_pending; /* DELETE_ON_CLOSE is set */ - bool invalid_mapping; /* pagecache is invalid */ +#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ +#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ +#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */ +#define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */ +#define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ +#define CIFS_INO_LOCK (5) /* lock bit for synchronization */ + unsigned long flags; + spinlock_t writers_lock; + unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index a630475e421..33df36ef9d5 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -428,7 +428,7 @@ struct smb_hdr { __u16 Tid; __le16 Pid; __u16 Uid; - __u16 Mid; + __le16 Mid; __u8 WordCount; } __attribute__((packed)); @@ -697,7 +697,13 @@ struct ntlmssp2_name { } __attribute__((packed)); struct ntlmv2_resp { - char ntlmv2_hash[CIFS_ENCPWD_SIZE]; + union { + char ntlmv2_hash[CIFS_ENCPWD_SIZE]; + struct { + __u8 reserved[8]; + __u8 key[CIFS_SERVER_CHALLENGE_SIZE]; + } __attribute__((packed)) challenge; + } __attribute__((packed)); __le32 blob_signature; __u32 reserved; __le64 time; @@ -1352,6 +1358,35 @@ typedef struct smb_com_transaction_ioctl_req { __u8 Data[1]; } __attribute__((packed)) TRANSACT_IOCTL_REQ; +typedef struct smb_com_transaction_compr_ioctl_req { + struct smb_hdr hdr; /* wct = 23 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 2 = IOCTL/FSCTL */ + __le32 FunctionCode; + __u16 Fid; + __u8 IsFsctl; /* 1 = File System Control 0 = device control (IOCTL) */ + __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */ + __le16 ByteCount; + __u8 Pad[3]; + __le16 compression_state; /* See below for valid flags */ +} __attribute__((packed)) TRANSACT_COMPR_IOCTL_REQ; + +/* compression state flags */ +#define COMPRESSION_FORMAT_NONE 0x0000 +#define COMPRESSION_FORMAT_DEFAULT 0x0001 +#define COMPRESSION_FORMAT_LZNT1 0x0002 + typedef struct smb_com_transaction_ioctl_rsp { struct smb_hdr hdr; /* wct = 19 */ __u8 Reserved[3]; @@ -1491,15 +1526,30 @@ struct file_notify_information { __u8 FileName[0]; } __attribute__((packed)); -struct reparse_data { - __u32 ReparseTag; - __u16 ReparseDataLength; +/* For IO_REPARSE_TAG_SYMLINK */ +struct reparse_symlink_data { + __le32 ReparseTag; + __le16 ReparseDataLength; __u16 Reserved; - __u16 SubstituteNameOffset; - __u16 SubstituteNameLength; - __u16 PrintNameOffset; - __u16 PrintNameLength; - __u32 Flags; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __le32 Flags; + char PathBuffer[0]; +} __attribute__((packed)); + +/* For IO_REPARSE_TAG_NFS */ +#define NFS_SPECFILE_LNK 0x00000000014B4E4C +#define NFS_SPECFILE_CHR 0x0000000000524843 +#define NFS_SPECFILE_BLK 0x00000000004B4C42 +#define NFS_SPECFILE_FIFO 0x000000004F464946 +#define NFS_SPECFILE_SOCK 0x000000004B434F53 +struct reparse_posix_data { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le64 InodeType; /* LNK, FIFO, CHR etc. */ char PathBuffer[0]; } __attribute__((packed)); @@ -2200,6 +2250,9 @@ typedef struct { __le32 DeviceCharacteristics; } __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */ +/* minimum includes first three fields, and empty FS Name */ +#define MIN_FS_ATTR_INFO_SIZE 12 + typedef struct { __le32 Attributes; __le32 MaxPathNameComponentLength; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index b5ec2a268f5..ca7980a1e30 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); +extern int cifs_get_writer(struct cifsInodeInfo *cinode); +extern void cifs_put_writer(struct cifsInodeInfo *cinode); +extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode); extern int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); @@ -151,7 +154,7 @@ extern struct inode *cifs_iget(struct super_block *sb, extern int cifs_get_inode_info(struct inode **inode, const char *full_path, FILE_ALL_INFO *data, struct super_block *sb, - int xid, const __u16 *fid); + int xid, const struct cifs_fid *fid); extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, unsigned int xid); @@ -162,11 +165,13 @@ extern int cifs_rename_pending_delete(const char *full_path, const unsigned int xid); extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, - const char *path, const __u16 *pfid); + const char *path, const struct cifs_fid *pfid); extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, kuid_t, kgid_t); extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *); +extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, + const struct cifs_fid *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); @@ -360,11 +365,10 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid, extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, char **symlinkinfo, const struct nls_table *nls_codepage); -extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, - const char *fileName, const int disposition, - const int access_flags, const int omode, - __u16 *netfid, int *pOplock, FILE_ALL_INFO *, - const struct nls_table *nls_codepage, int remap); +extern int CIFSSMB_set_compression(const unsigned int xid, + struct cifs_tcon *tcon, __u16 fid); +extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, + int *oplock, FILE_ALL_INFO *buf); extern int SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const int disposition, const int access_flags, const int omode, @@ -474,10 +478,11 @@ extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); -extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr); -extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, unsigned int xid); +extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr); +extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + const unsigned char *path); extern int mdfour(unsigned char *, unsigned char *, int); extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, const struct nls_table *codepage); @@ -488,12 +493,18 @@ void cifs_readdata_release(struct kref *refcount); int cifs_async_readv(struct cifs_readdata *rdata); int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid); -int cifs_async_writev(struct cifs_writedata *wdata); +int cifs_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)); void cifs_writev_complete(struct work_struct *work); struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete); void cifs_writedata_release(struct kref *refcount); -int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, - unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, - unsigned int xid); +int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const unsigned char *path, char *pbuf, + unsigned int *pbytes_read); +int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const unsigned char *path, char *pbuf, + unsigned int *pbytes_written); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4baf35949b5..6ce4e0954b9 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1273,104 +1273,124 @@ OldOpenRetry: } int -CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, - const char *fileName, const int openDisposition, - const int access_flags, const int create_options, __u16 *netfid, - int *pOplock, FILE_ALL_INFO *pfile_info, - const struct nls_table *nls_codepage, int remap) +CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock, + FILE_ALL_INFO *buf) { int rc = -EACCES; - OPEN_REQ *pSMB = NULL; - OPEN_RSP *pSMBr = NULL; + OPEN_REQ *req = NULL; + OPEN_RSP *rsp = NULL; int bytes_returned; int name_len; __u16 count; + struct cifs_sb_info *cifs_sb = oparms->cifs_sb; + struct cifs_tcon *tcon = oparms->tcon; + int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + const struct nls_table *nls = cifs_sb->local_nls; + int create_options = oparms->create_options; + int desired_access = oparms->desired_access; + int disposition = oparms->disposition; + const char *path = oparms->path; openRetry: - rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **) &pSMB, - (void **) &pSMBr); + rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **)&req, + (void **)&rsp); if (rc) return rc; - pSMB->AndXCommand = 0xFF; /* none */ + /* no commands go after this */ + req->AndXCommand = 0xFF; - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - count = 1; /* account for one byte pad to word boundary */ - name_len = - cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1), - fileName, PATH_MAX, nls_codepage, remap); - name_len++; /* trailing null */ + if (req->hdr.Flags2 & SMBFLG2_UNICODE) { + /* account for one byte pad to word boundary */ + count = 1; + name_len = cifsConvertToUTF16((__le16 *)(req->fileName + 1), + path, PATH_MAX, nls, remap); + /* trailing null */ + name_len++; name_len *= 2; - pSMB->NameLength = cpu_to_le16(name_len); - } else { /* BB improve check for buffer overruns BB */ - count = 0; /* no pad */ - name_len = strnlen(fileName, PATH_MAX); - name_len++; /* trailing null */ - pSMB->NameLength = cpu_to_le16(name_len); - strncpy(pSMB->fileName, fileName, name_len); + req->NameLength = cpu_to_le16(name_len); + } else { + /* BB improve check for buffer overruns BB */ + /* no pad */ + count = 0; + name_len = strnlen(path, PATH_MAX); + /* trailing null */ + name_len++; + req->NameLength = cpu_to_le16(name_len); + strncpy(req->fileName, path, name_len); } - if (*pOplock & REQ_OPLOCK) - pSMB->OpenFlags = cpu_to_le32(REQ_OPLOCK); - else if (*pOplock & REQ_BATCHOPLOCK) - pSMB->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK); - pSMB->DesiredAccess = cpu_to_le32(access_flags); - pSMB->AllocationSize = 0; - /* set file as system file if special file such - as fifo and server expecting SFU style and - no Unix extensions */ + + if (*oplock & REQ_OPLOCK) + req->OpenFlags = cpu_to_le32(REQ_OPLOCK); + else if (*oplock & REQ_BATCHOPLOCK) + req->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK); + + req->DesiredAccess = cpu_to_le32(desired_access); + req->AllocationSize = 0; + + /* + * Set file as system file if special file such as fifo and server + * expecting SFU style and no Unix extensions. + */ if (create_options & CREATE_OPTION_SPECIAL) - pSMB->FileAttributes = cpu_to_le32(ATTR_SYSTEM); + req->FileAttributes = cpu_to_le32(ATTR_SYSTEM); else - pSMB->FileAttributes = cpu_to_le32(ATTR_NORMAL); + req->FileAttributes = cpu_to_le32(ATTR_NORMAL); - /* XP does not handle ATTR_POSIX_SEMANTICS */ - /* but it helps speed up case sensitive checks for other - servers such as Samba */ + /* + * XP does not handle ATTR_POSIX_SEMANTICS but it helps speed up case + * sensitive checks for other servers such as Samba. + */ if (tcon->ses->capabilities & CAP_UNIX) - pSMB->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS); + req->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS); if (create_options & CREATE_OPTION_READONLY) - pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY); + req->FileAttributes |= cpu_to_le32(ATTR_READONLY); + + req->ShareAccess = cpu_to_le32(FILE_SHARE_ALL); + req->CreateDisposition = cpu_to_le32(disposition); + req->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK); - pSMB->ShareAccess = cpu_to_le32(FILE_SHARE_ALL); - pSMB->CreateDisposition = cpu_to_le32(openDisposition); - pSMB->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK); /* BB Expirement with various impersonation levels and verify */ - pSMB->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION); - pSMB->SecurityFlags = - SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY; + req->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION); + req->SecurityFlags = SECURITY_CONTEXT_TRACKING|SECURITY_EFFECTIVE_ONLY; count += name_len; - inc_rfc1001_len(pSMB, count); + inc_rfc1001_len(req, count); - pSMB->ByteCount = cpu_to_le16(count); - /* long_op set to 1 to allow for oplock break timeouts */ - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *)pSMBr, &bytes_returned, 0); + req->ByteCount = cpu_to_le16(count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)req, + (struct smb_hdr *)rsp, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); if (rc) { cifs_dbg(FYI, "Error in Open = %d\n", rc); - } else { - *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */ - *netfid = pSMBr->Fid; /* cifs fid stays in le */ - /* Let caller know file was created so we can set the mode. */ - /* Do we care about the CreateAction in any other cases? */ - if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction) - *pOplock |= CIFS_CREATE_ACTION; - if (pfile_info) { - memcpy((char *)pfile_info, (char *)&pSMBr->CreationTime, - 36 /* CreationTime to Attributes */); - /* the file_info buf is endian converted by caller */ - pfile_info->AllocationSize = pSMBr->AllocationSize; - pfile_info->EndOfFile = pSMBr->EndOfFile; - pfile_info->NumberOfLinks = cpu_to_le32(1); - pfile_info->DeletePending = 0; - } + cifs_buf_release(req); + if (rc == -EAGAIN) + goto openRetry; + return rc; } - cifs_buf_release(pSMB); - if (rc == -EAGAIN) - goto openRetry; + /* 1 byte no need to le_to_cpu */ + *oplock = rsp->OplockLevel; + /* cifs fid stays in le */ + oparms->fid->netfid = rsp->Fid; + + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? */ + if (cpu_to_le32(FILE_CREATE) == rsp->CreateAction) + *oplock |= CIFS_CREATE_ACTION; + + if (buf) { + /* copy from CreationTime to Attributes */ + memcpy((char *)buf, (char *)&rsp->CreationTime, 36); + /* the file_info buf is endian converted by caller */ + buf->AllocationSize = rsp->AllocationSize; + buf->EndOfFile = rsp->EndOfFile; + buf->NumberOfLinks = cpu_to_le32(1); + buf->DeletePending = 0; + } + + cifs_buf_release(req); return rc; } @@ -1890,7 +1910,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) do { server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata); + rc = server->ops->async_writev(wdata, cifs_writedata_release); } while (rc == -EAGAIN); for (i = 0; i < wdata->nr_pages; i++) { @@ -1942,15 +1962,9 @@ cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) { struct cifs_writedata *wdata; - /* this would overflow */ - if (nr_pages == 0) { - cifs_dbg(VFS, "%s: called with nr_pages == 0!\n", __func__); - return NULL; - } - /* writedata + number of page pointers */ wdata = kzalloc(sizeof(*wdata) + - sizeof(struct page *) * (nr_pages - 1), GFP_NOFS); + sizeof(struct page *) * nr_pages, GFP_NOFS); if (wdata != NULL) { kref_init(&wdata->refcount); INIT_LIST_HEAD(&wdata->list); @@ -2011,7 +2025,8 @@ cifs_writev_callback(struct mid_q_entry *mid) /* cifs_async_writev - send an async write, and set up mid to handle result */ int -cifs_async_writev(struct cifs_writedata *wdata) +cifs_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)) { int rc = -EACCES; WRITE_REQ *smb = NULL; @@ -2085,7 +2100,7 @@ cifs_async_writev(struct cifs_writedata *wdata) if (rc == 0) cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); else - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, release); async_writev_out: cifs_small_buf_release(smb); @@ -3088,7 +3103,8 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, bool is_unicode; unsigned int sub_len; char *sub_start; - struct reparse_data *reparse_buf; + struct reparse_symlink_data *reparse_buf; + struct reparse_posix_data *posix_buf; __u32 data_offset, data_count; char *end_of_smb; @@ -3137,20 +3153,47 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, goto qreparse_out; } end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; - reparse_buf = (struct reparse_data *) + reparse_buf = (struct reparse_symlink_data *) ((char *)&pSMBr->hdr.Protocol + data_offset); if ((char *)reparse_buf >= end_of_smb) { rc = -EIO; goto qreparse_out; } - if ((reparse_buf->PathBuffer + reparse_buf->PrintNameOffset + - reparse_buf->PrintNameLength) > end_of_smb) { + if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) { + cifs_dbg(FYI, "NFS style reparse tag\n"); + posix_buf = (struct reparse_posix_data *)reparse_buf; + + if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) { + cifs_dbg(FYI, "unsupported file type 0x%llx\n", + le64_to_cpu(posix_buf->InodeType)); + rc = -EOPNOTSUPP; + goto qreparse_out; + } + is_unicode = true; + sub_len = le16_to_cpu(reparse_buf->ReparseDataLength); + if (posix_buf->PathBuffer + sub_len > end_of_smb) { + cifs_dbg(FYI, "reparse buf beyond SMB\n"); + rc = -EIO; + goto qreparse_out; + } + *symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer, + sub_len, is_unicode, nls_codepage); + goto qreparse_out; + } else if (reparse_buf->ReparseTag != + cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) { + rc = -EOPNOTSUPP; + goto qreparse_out; + } + + /* Reparse tag is NTFS symlink */ + sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) + + reparse_buf->PathBuffer; + sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength); + if (sub_start + sub_len > end_of_smb) { cifs_dbg(FYI, "reparse buf beyond SMB\n"); rc = -EIO; goto qreparse_out; } - sub_start = reparse_buf->SubstituteNameOffset + reparse_buf->PathBuffer; - sub_len = reparse_buf->SubstituteNameLength; if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) is_unicode = true; else @@ -3171,6 +3214,60 @@ qreparse_out: return rc; } +int +CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + __u16 fid) +{ + int rc = 0; + int bytes_returned; + struct smb_com_transaction_compr_ioctl_req *pSMB; + struct smb_com_transaction_ioctl_rsp *pSMBr; + + cifs_dbg(FYI, "Set compression for %u\n", fid); + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + + pSMB->TotalParameterCount = 0; + pSMB->TotalDataCount = __constant_cpu_to_le32(2); + pSMB->MaxParameterCount = 0; + pSMB->MaxDataCount = 0; + pSMB->MaxSetupCount = 4; + pSMB->Reserved = 0; + pSMB->ParameterOffset = 0; + pSMB->DataCount = __constant_cpu_to_le32(2); + pSMB->DataOffset = + cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, + compression_state) - 4); /* 84 */ + pSMB->SetupCount = 4; + pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->ParameterCount = 0; + pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION); + pSMB->IsFsctl = 1; /* FSCTL */ + pSMB->IsRootFlag = 0; + pSMB->Fid = fid; /* file handle always le */ + /* 3 byte pad, followed by 2 byte compress state */ + pSMB->ByteCount = __constant_cpu_to_le16(5); + inc_rfc1001_len(pSMB, 5); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "Send error in SetCompression = %d\n", rc); + + cifs_buf_release(pSMB); + + /* + * Note: On -EAGAIN error only caller can retry on handle based calls + * since file handle passed in no longer valid. + */ + return rc; +} + + #ifdef CONFIG_CIFS_POSIX /*Convert an Access Control Entry from wire format to local POSIX xattr format*/ @@ -3287,11 +3384,13 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, return 0; } cifs_acl->version = cpu_to_le16(1); - if (acl_type == ACL_TYPE_ACCESS) + if (acl_type == ACL_TYPE_ACCESS) { cifs_acl->access_entry_count = cpu_to_le16(count); - else if (acl_type == ACL_TYPE_DEFAULT) + cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF); + } else if (acl_type == ACL_TYPE_DEFAULT) { cifs_acl->default_entry_count = cpu_to_le16(count); - else { + cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF); + } else { cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; } @@ -3926,7 +4025,7 @@ QFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in QFileInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4095,7 +4194,7 @@ UnixQFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in UnixQFileInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4179,7 +4278,7 @@ UnixQPathInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in UnixQPathInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -6098,6 +6197,9 @@ QAllEAsRetry: cifs_dbg(FYI, "ea length %d\n", list_len); if (list_len <= 8) { cifs_dbg(FYI, "empty EA list returned from server\n"); + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; goto QAllEAsOut; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a279ffc0bc2..20d75b8ddb2 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2144,6 +2144,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info) sizeof(tcp_ses->srcaddr)); memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, sizeof(tcp_ses->dstaddr)); +#ifdef CONFIG_CIFS_SMB2 + get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE); +#endif /* * at this point we are the only ones with the pointer * to the struct since the kernel thread not created yet @@ -2225,7 +2228,7 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) vol->username ? vol->username : "", CIFS_MAX_USERNAME_LEN)) return 0; - if (strlen(vol->username) != 0 && + if ((vol->username && strlen(vol->username) != 0) && ses->password != NULL && strncmp(ses->password, vol->password ? vol->password : "", @@ -2242,6 +2245,8 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (ses->status == CifsExiting) + continue; if (!match_session(ses, vol)) continue; ++ses->ses_count; @@ -2255,24 +2260,37 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) static void cifs_put_smb_ses(struct cifs_ses *ses) { - unsigned int xid; + unsigned int rc, xid; struct TCP_Server_Info *server = ses->server; cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); + spin_lock(&cifs_tcp_ses_lock); + if (ses->status == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } if (--ses->ses_count > 0) { spin_unlock(&cifs_tcp_ses_lock); return; } - - list_del_init(&ses->smb_ses_list); + if (ses->status == CifsGood) + ses->status = CifsExiting; spin_unlock(&cifs_tcp_ses_lock); - if (ses->status == CifsGood && server->ops->logoff) { + if (ses->status == CifsExiting && server->ops->logoff) { xid = get_xid(); - server->ops->logoff(xid, ses); + rc = server->ops->logoff(xid, ses); + if (rc) + cifs_dbg(VFS, "%s: Session Logoff failure rc=%d\n", + __func__, rc); _free_xid(xid); } + + spin_lock(&cifs_tcp_ses_lock); + list_del_init(&ses->smb_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + sesInfoFree(ses); cifs_put_tcp_session(server); } @@ -3755,6 +3773,13 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, return rc; } +static void delayed_free(struct rcu_head *p) +{ + struct cifs_sb_info *sbi = container_of(p, struct cifs_sb_info, rcu); + unload_nls(sbi->local_nls); + kfree(sbi); +} + void cifs_umount(struct cifs_sb_info *cifs_sb) { @@ -3779,8 +3804,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb) bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb->mountdata); - unload_nls(cifs_sb->local_nls); - kfree(cifs_sb); + call_rcu(&cifs_sb->rcu, delayed_free); } int diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 5384c2a640c..3db0c5fd9a1 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -193,7 +193,7 @@ check_name(struct dentry *direntry) static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, struct tcon_link *tlink, unsigned oflags, umode_t mode, - __u32 *oplock, struct cifs_fid *fid, int *created) + __u32 *oplock, struct cifs_fid *fid) { int rc = -ENOENT; int create_options = CREATE_NOT_DIR; @@ -349,7 +349,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, .device = 0, }; - *created |= FILE_CREATED; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { args.uid = current_fsuid(); if (inode->i_mode & S_ISGID) @@ -379,7 +378,7 @@ cifs_create_get_file_info: xid); else { rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, - xid, &fid->netfid); + xid, fid); if (newinode) { if (server->ops->set_lease_key) server->ops->set_lease_key(newinode, fid); @@ -480,13 +479,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, cifs_add_pending_open(&fid, tlink, &open); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid, opened); + &oplock, &fid); if (rc) { cifs_del_pending_open(&open); goto out; } + if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + *opened |= FILE_CREATED; + rc = finish_open(file, direntry, generic_file_open, opened); if (rc) { if (server->ops->close) @@ -529,7 +531,6 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, struct TCP_Server_Info *server; struct cifs_fid fid; __u32 oplock; - int created = FILE_CREATED; cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n", inode, direntry->d_name.name, direntry); @@ -546,7 +547,7 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, server->ops->new_lease_key(&fid); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid, &created); + &oplock, &fid); if (!rc && server->ops->close) server->ops->close(xid, tcon, &fid); @@ -564,12 +565,13 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; - struct cifs_tcon *pTcon; + struct cifs_tcon *tcon; struct cifs_io_parms io_parms; char *full_path = NULL; struct inode *newinode = NULL; int oplock = 0; - u16 fileHandle; + struct cifs_fid fid; + struct cifs_open_parms oparms; FILE_ALL_INFO *buf = NULL; unsigned int bytes_written; struct win_dev *pdev; @@ -582,7 +584,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, if (IS_ERR(tlink)) return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); + tcon = tlink_tcon(tlink); xid = get_xid(); @@ -592,7 +594,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; } - if (pTcon->unix_ext) { + if (tcon->unix_ext) { struct cifs_unix_set_info_args args = { .mode = mode & ~current_umask(), .ctime = NO_CHANGE_64, @@ -607,7 +609,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, args.uid = INVALID_UID; /* no change */ args.gid = INVALID_GID; /* no change */ } - rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args, + rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -639,42 +641,44 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, - GENERIC_WRITE, create_options, - &fileHandle, &oplock, buf, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, buf); if (rc) goto mknod_out; - /* BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely */ + /* + * BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely. + */ pdev = (struct win_dev *)buf; - io_parms.netfid = fileHandle; + io_parms.netfid = fid.netfid; io_parms.pid = current->tgid; - io_parms.tcon = pTcon; + io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = sizeof(struct win_dev); if (S_ISCHR(mode)) { memcpy(pdev->type, "IntxCHR", 8); - pdev->major = - cpu_to_le64(MAJOR(device_number)); - pdev->minor = - cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, &io_parms, - &bytes_written, (char *)pdev, - NULL, 0); + pdev->major = cpu_to_le64(MAJOR(device_number)); + pdev->minor = cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev, + NULL, 0); } else if (S_ISBLK(mode)) { memcpy(pdev->type, "IntxBLK", 8); - pdev->major = - cpu_to_le64(MAJOR(device_number)); - pdev->minor = - cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, &io_parms, - &bytes_written, (char *)pdev, - NULL, 0); + pdev->major = cpu_to_le64(MAJOR(device_number)); + pdev->minor = cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev, + NULL, 0); } /* else if (S_ISFIFO) */ - CIFSSMBClose(xid, pTcon, fileHandle); + CIFSSMBClose(xid, tcon, fid.netfid); d_drop(direntry); /* FIXME: add code here to set EAs */ @@ -756,7 +760,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, /* if it was once a directory (but how can we tell?) we could do shrink_dcache_parent(direntry); */ } else if (rc != -EACCES) { - cifs_dbg(VFS, "Unexpected lookup error %d\n", rc); + cifs_dbg(FYI, "Unexpected lookup error %d\n", rc); /* We special case check for Access Denied - since that is a common return code */ } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7ddddf2e250..e90a1e9aa62 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -244,7 +244,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, xid); else rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, - xid, &fid->netfid); + xid, fid); out: kfree(buf); @@ -335,7 +335,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, spin_unlock(&cifs_file_list_lock); if (fid->purge_cache) - cifs_invalidate_mapping(inode); + cifs_zap_mapping(inode); file->private_data = cfile; return cfile; @@ -392,7 +392,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) * again and get at least level II oplock. */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) - CIFS_I(inode)->invalid_mapping = true; + set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags); cifs_set_oplock_level(cifsi, 0); } spin_unlock(&cifs_file_list_lock); @@ -678,7 +678,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) /* * Can not refresh inode by passing in file_info buf to be returned by - * CIFSSMBOpen and then calling get_inode_info with returned buf since + * ops->open and then calling get_inode_info with returned buf since * file might have write behind data that needs to be flushed and server * version of file size can be stale. If we knew for sure that inode was * not dirty locally we could do this. @@ -1529,7 +1529,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, */ if (!CIFS_CACHE_WRITE(CIFS_I(inode)) && CIFS_CACHE_READ(CIFS_I(inode))) { - cifs_invalidate_mapping(inode); + cifs_zap_mapping(inode); cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n", inode); CIFS_I(inode)->oplock = 0; @@ -2043,7 +2043,8 @@ retry: } wdata->pid = wdata->cfile->pid; server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata); + rc = server->ops->async_writev(wdata, + cifs_writedata_release); } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); for (i = 0; i < nr_pages; ++i) @@ -2217,7 +2218,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, file->f_path.dentry->d_name.name, datasync); if (!CIFS_CACHE_READ(CIFS_I(inode))) { - rc = cifs_invalidate_mapping(inode); + rc = cifs_zap_mapping(inode); if (rc) { cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc); rc = 0; /* don't care about it in fsync */ @@ -2331,9 +2332,20 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) } static void -cifs_uncached_writev_complete(struct work_struct *work) +cifs_uncached_writedata_release(struct kref *refcount) { int i; + struct cifs_writedata *wdata = container_of(refcount, + struct cifs_writedata, refcount); + + for (i = 0; i < wdata->nr_pages; i++) + put_page(wdata->pages[i]); + cifs_writedata_release(refcount); +} + +static void +cifs_uncached_writev_complete(struct work_struct *work) +{ struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); struct inode *inode = wdata->cfile->dentry->d_inode; @@ -2347,12 +2359,7 @@ cifs_uncached_writev_complete(struct work_struct *work) complete(&wdata->done); - if (wdata->result != -EAGAIN) { - for (i = 0; i < wdata->nr_pages; i++) - put_page(wdata->pages[i]); - } - - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, cifs_uncached_writedata_release); } /* attempt to send write to server, retry on any -EAGAIN errors */ @@ -2370,21 +2377,20 @@ cifs_uncached_retry_writev(struct cifs_writedata *wdata) if (rc != 0) continue; } - rc = server->ops->async_writev(wdata); + rc = server->ops->async_writev(wdata, + cifs_uncached_writedata_release); } while (rc == -EAGAIN); return rc; } static ssize_t -cifs_iovec_write(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *poffset) +cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) { unsigned long nr_pages, i; - size_t copied, len, cur_len; + size_t bytes, copied, len, cur_len; ssize_t total_written = 0; loff_t offset; - struct iov_iter it; struct cifsFileInfo *open_file; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; @@ -2393,14 +2399,16 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, int rc; pid_t pid; - len = iov_length(iov, nr_segs); - if (!len) - return 0; - + len = iov_iter_count(from); rc = generic_write_checks(file, poffset, &len, 0); if (rc) return rc; + if (!len) + return 0; + + iov_iter_truncate(from, len); + INIT_LIST_HEAD(&wdata_list); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); open_file = file->private_data; @@ -2416,7 +2424,6 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, else pid = current->tgid; - iov_iter_init(&it, iov, nr_segs, len, 0); do { size_t save_len; @@ -2436,14 +2443,44 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, save_len = cur_len; for (i = 0; i < nr_pages; i++) { - copied = min_t(const size_t, cur_len, PAGE_SIZE); - copied = iov_iter_copy_from_user(wdata->pages[i], &it, - 0, copied); + bytes = min_t(size_t, cur_len, PAGE_SIZE); + copied = copy_page_from_iter(wdata->pages[i], 0, bytes, + from); cur_len -= copied; - iov_iter_advance(&it, copied); + /* + * If we didn't copy as much as we expected, then that + * may mean we trod into an unmapped area. Stop copying + * at that point. On the next pass through the big + * loop, we'll likely end up getting a zero-length + * write and bailing out of it. + */ + if (copied < bytes) + break; } cur_len = save_len - cur_len; + /* + * If we have no data to send, then that probably means that + * the copy above failed altogether. That's most likely because + * the address in the iovec was bogus. Set the rc to -EFAULT, + * free anything we allocated and bail out. + */ + if (!cur_len) { + for (i = 0; i < nr_pages; i++) + put_page(wdata->pages[i]); + kfree(wdata); + rc = -EFAULT; + break; + } + + /* + * i + 1 now represents the number of pages we actually used in + * the copy phase above. Bring nr_pages down to that, and free + * any pages that we didn't use. + */ + for ( ; nr_pages > i + 1; nr_pages--) + put_page(wdata->pages[nr_pages - 1]); + wdata->sync_mode = WB_SYNC_ALL; wdata->nr_pages = nr_pages; wdata->offset = (__u64)offset; @@ -2454,7 +2491,8 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); rc = cifs_uncached_retry_writev(wdata); if (rc) { - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); break; } @@ -2496,7 +2534,7 @@ restart_loop: } } list_del_init(&wdata->list); - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, cifs_uncached_writedata_release); } if (total_written > 0) @@ -2506,11 +2544,11 @@ restart_loop: return total_written ? total_written : (ssize_t)rc; } -ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) { ssize_t written; struct inode *inode; + loff_t pos = iocb->ki_pos; inode = file_inode(iocb->ki_filp); @@ -2520,9 +2558,9 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, * write request. */ - written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos); + written = cifs_iovec_write(iocb->ki_filp, from, &pos); if (written > 0) { - CIFS_I(inode)->invalid_mapping = true; + set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags); iocb->ki_pos = pos; } @@ -2530,8 +2568,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, } static ssize_t -cifs_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +cifs_writev(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; @@ -2539,38 +2576,38 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, struct cifsInodeInfo *cinode = CIFS_I(inode); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; ssize_t rc = -EACCES; - - BUG_ON(iocb->ki_pos != pos); + loff_t lock_pos = iocb->ki_pos; /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); - if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), + mutex_lock(&inode->i_mutex); + if (file->f_flags & O_APPEND) + lock_pos = i_size_read(inode); + if (!cifs_find_lock_conflict(cfile, lock_pos, iov_iter_count(from), server->vals->exclusive_lock_type, NULL, CIFS_WRITE_OP)) { - mutex_lock(&inode->i_mutex); - rc = __generic_file_aio_write(iocb, iov, nr_segs, - &iocb->ki_pos); + rc = __generic_file_write_iter(iocb, from); mutex_unlock(&inode->i_mutex); - } - if (rc > 0) { - ssize_t err; + if (rc > 0) { + ssize_t err; - err = generic_write_sync(file, pos, rc); - if (err < 0 && rc > 0) - rc = err; + err = generic_write_sync(file, iocb->ki_pos - rc, rc); + if (err < 0) + rc = err; + } + } else { + mutex_unlock(&inode->i_mutex); } - up_read(&cinode->lock_sem); return rc; } ssize_t -cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct cifsInodeInfo *cinode = CIFS_I(inode); @@ -2580,12 +2617,19 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); ssize_t written; + written = cifs_get_writer(cinode); + if (written) + return written; + if (CIFS_CACHE_WRITE(cinode)) { if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) - && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - return cifs_writev(iocb, iov, nr_segs, pos); + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + written = generic_file_write_iter(iocb, from); + goto out; + } + written = cifs_writev(iocb, from); + goto out; } /* * For non-oplocked files in strict cache mode we need to write the data @@ -2593,18 +2637,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, * affected pages because it may cause a error with mandatory locks on * these pages but not on the region from pos to ppos+len-1. */ - written = cifs_user_writev(iocb, iov, nr_segs, pos); + written = cifs_user_writev(iocb, from); if (written > 0 && CIFS_CACHE_READ(cinode)) { /* * Windows 7 server can delay breaking level2 oplock if a write * request comes - break it on the client to prevent reading * an old data. */ - cifs_invalidate_mapping(inode); + cifs_zap_mapping(inode); cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n", inode); cinode->oplock = 0; } +out: + cifs_put_writer(cinode); return written; } @@ -2699,56 +2745,27 @@ cifs_retry_async_readv(struct cifs_readdata *rdata) /** * cifs_readdata_to_iov - copy data from pages in response to an iovec * @rdata: the readdata response with list of pages holding data - * @iov: vector in which we should copy the data - * @nr_segs: number of segments in vector - * @offset: offset into file of the first iovec - * @copied: used to return the amount of data copied to the iov + * @iter: destination for our data * * This function copies data from a list of pages in a readdata response into * an array of iovecs. It will first calculate where the data should go * based on the info in the readdata and then copy the data into that spot. */ -static ssize_t -cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov, - unsigned long nr_segs, loff_t offset, ssize_t *copied) +static int +cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) { - int rc = 0; - struct iov_iter ii; - size_t pos = rdata->offset - offset; - ssize_t remaining = rdata->bytes; - unsigned char *pdata; + size_t remaining = rdata->bytes; unsigned int i; - /* set up iov_iter and advance to the correct offset */ - iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0); - iov_iter_advance(&ii, pos); - - *copied = 0; for (i = 0; i < rdata->nr_pages; i++) { - ssize_t copy; struct page *page = rdata->pages[i]; - - /* copy a whole page or whatever's left */ - copy = min_t(ssize_t, remaining, PAGE_SIZE); - - /* ...but limit it to whatever space is left in the iov */ - copy = min_t(ssize_t, copy, iov_iter_count(&ii)); - - /* go while there's data to be copied and no errors */ - if (copy && !rc) { - pdata = kmap(page); - rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset, - (int)copy); - kunmap(page); - if (!rc) { - *copied += copy; - remaining -= copy; - iov_iter_advance(&ii, copy); - } - } + size_t copy = min_t(size_t, remaining, PAGE_SIZE); + size_t written = copy_page_to_iter(page, 0, copy, iter); + remaining -= written; + if (written < copy && iov_iter_count(iter) > 0) + break; } - - return rc; + return remaining ? -EFAULT : 0; } static void @@ -2809,14 +2826,13 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, return total_read > 0 ? total_read : result; } -static ssize_t -cifs_iovec_read(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *poffset) +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) { + struct file *file = iocb->ki_filp; ssize_t rc; size_t len, cur_len; ssize_t total_read = 0; - loff_t offset = *poffset; + loff_t offset = iocb->ki_pos; unsigned int npages; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; @@ -2825,10 +2841,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, struct list_head rdata_list; pid_t pid; - if (!nr_segs) - return 0; - - len = iov_length(iov, nr_segs); + len = iov_iter_count(to); if (!len) return 0; @@ -2857,7 +2870,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, cifs_uncached_readv_complete); if (!rdata) { rc = -ENOMEM; - goto error; + break; } rc = cifs_read_allocate_pages(rdata, npages); @@ -2889,60 +2902,48 @@ error: if (!list_empty(&rdata_list)) rc = 0; + len = iov_iter_count(to); /* the loop below should proceed in the order of increasing offsets */ -restart_loop: list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { + again: if (!rc) { - ssize_t copied; - /* FIXME: freezable sleep too? */ rc = wait_for_completion_killable(&rdata->done); if (rc) rc = -EINTR; - else if (rdata->result) + else if (rdata->result) { rc = rdata->result; - else { - rc = cifs_readdata_to_iov(rdata, iov, - nr_segs, *poffset, - &copied); - total_read += copied; + /* resend call if it's a retryable error */ + if (rc == -EAGAIN) { + rc = cifs_retry_async_readv(rdata); + goto again; + } + } else { + rc = cifs_readdata_to_iov(rdata, to); } - /* resend call if it's a retryable error */ - if (rc == -EAGAIN) { - rc = cifs_retry_async_readv(rdata); - goto restart_loop; - } } list_del_init(&rdata->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); } + total_read = len - iov_iter_count(to); + cifs_stats_bytes_read(tcon, total_read); - *poffset += total_read; /* mask nodata case */ if (rc == -ENODATA) rc = 0; - return total_read ? total_read : rc; -} - -ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - ssize_t read; - - read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos); - if (read > 0) - iocb->ki_pos = pos; - - return read; + if (total_read) { + iocb->ki_pos += total_read; + return total_read; + } + return rc; } ssize_t -cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); struct cifsInodeInfo *cinode = CIFS_I(inode); @@ -2961,22 +2962,22 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, * pos+len-1. */ if (!CIFS_CACHE_READ(cinode)) - return cifs_user_readv(iocb, iov, nr_segs, pos); + return cifs_user_readv(iocb, to); if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_read(iocb, iov, nr_segs, pos); + return generic_file_read_iter(iocb, to); /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents reading. */ down_read(&cinode->lock_sem); - if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), + if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to), tcon->ses->server->vals->shared_lock_type, NULL, CIFS_READ_OP)) - rc = generic_file_aio_read(iocb, iov, nr_segs, pos); + rc = generic_file_read_iter(iocb, to); up_read(&cinode->lock_sem); return rc; } @@ -3085,6 +3086,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -3097,7 +3099,7 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) xid = get_xid(); if (!CIFS_CACHE_READ(CIFS_I(inode))) { - rc = cifs_invalidate_mapping(inode); + rc = cifs_zap_mapping(inode); if (rc) return rc; } @@ -3616,6 +3618,13 @@ static int cifs_launder_page(struct page *page) return rc; } +static int +cifs_pending_writers_wait(void *unused) +{ + schedule(); + return 0; +} + void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3623,8 +3632,15 @@ void cifs_oplock_break(struct work_struct *work) struct inode *inode = cfile->dentry->d_inode; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; + wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, + cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + + server->ops->downgrade_oplock(server, cinode, + test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); + if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", @@ -3641,7 +3657,7 @@ void cifs_oplock_break(struct work_struct *work) if (!CIFS_CACHE_READ(cinode)) { rc = filemap_fdatawait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); - cifs_invalidate_mapping(inode); + cifs_zap_mapping(inode); } cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc); } @@ -3661,8 +3677,30 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + cifs_done_oplock_break(cinode); } +/* + * The presence of cifs_direct_io() in the address space ops vector + * allowes open() O_DIRECT flags which would have failed otherwise. + * + * In the non-cached mode (mount with cache=none), we shunt off direct read and write requests + * so this method should never be called. + * + * Direct IO is not yet supported in the cached mode. + */ +static ssize_t +cifs_direct_io(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t pos) +{ + /* + * FIXME + * Eventually need to support direct IO for non forcedirectio mounts + */ + return -EINVAL; +} + + const struct address_space_operations cifs_addr_ops = { .readpage = cifs_readpage, .readpages = cifs_readpages, @@ -3672,6 +3710,7 @@ const struct address_space_operations cifs_addr_ops = { .write_end = cifs_write_end, .set_page_dirty = __set_page_dirty_nobuffers, .releasepage = cifs_release_page, + .direct_IO = cifs_direct_io, .invalidatepage = cifs_invalidate_page, .launder_page = cifs_launder_page, }; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index b3258f35e88..8d4b7bc8ae9 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -27,7 +27,7 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) { server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index, - &cifs_fscache_server_index_def, server); + &cifs_fscache_server_index_def, server, true); cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server, server->fscache); } @@ -46,7 +46,7 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) tcon->fscache = fscache_acquire_cookie(server->fscache, - &cifs_fscache_super_index_def, tcon); + &cifs_fscache_super_index_def, tcon, true); cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server->fscache, tcon->fscache); } @@ -69,7 +69,7 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { cifsi->fscache = fscache_acquire_cookie(tcon->fscache, - &cifs_fscache_inode_object_def, cifsi); + &cifs_fscache_inode_object_def, cifsi, true); cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", __func__, tcon->fscache, cifsi->fscache); } @@ -119,7 +119,7 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode) cifsi->fscache = fscache_acquire_cookie( cifs_sb_master_tcon(cifs_sb)->fscache, &cifs_fscache_inode_object_def, - cifsi); + cifsi, true); cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n", __func__, cifsi->fscache, old); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 867b7cdc794..a174605f6af 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -22,6 +22,7 @@ #include <linux/stat.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/freezer.h> #include <asm/div64.h> #include "cifsfs.h" #include "cifspdu.h" @@ -117,7 +118,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n", __func__, cifs_i->uniqueid); - cifs_i->invalid_mapping = true; + set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags); } /* @@ -177,7 +178,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) else cifs_i->time = jiffies; - cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; + if (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) + set_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); + else + clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); cifs_i->server_eof = fattr->cf_eof; /* @@ -383,9 +387,10 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) - cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } if (*pinode == NULL) { @@ -403,18 +408,20 @@ int cifs_get_inode_info_unix(struct inode **pinode, } static int -cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, +cifs_sfu_type(struct cifs_fattr *fattr, const char *path, struct cifs_sb_info *cifs_sb, unsigned int xid) { int rc; int oplock = 0; - __u16 netfid; struct tcon_link *tlink; struct cifs_tcon *tcon; + struct cifs_fid fid; + struct cifs_open_parms oparms; struct cifs_io_parms io_parms; char buf[24]; unsigned int bytes_read; char *pbuf; + int buf_type = CIFS_NO_BUFFER; pbuf = buf; @@ -435,62 +442,69 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, &netfid, &oplock, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - int buf_type = CIFS_NO_BUFFER; - /* Read header */ - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = 24; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, - &buf_type); - if ((rc == 0) && (bytes_read >= 8)) { - if (memcmp("IntxBLK", pbuf, 8) == 0) { - cifs_dbg(FYI, "Block device\n"); - fattr->cf_mode |= S_IFBLK; - fattr->cf_dtype = DT_BLK; - if (bytes_read == 24) { - /* we have enough to decode dev num */ - __u64 mjr; /* major */ - __u64 mnr; /* minor */ - mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); - mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); - fattr->cf_rdev = MKDEV(mjr, mnr); - } - } else if (memcmp("IntxCHR", pbuf, 8) == 0) { - cifs_dbg(FYI, "Char device\n"); - fattr->cf_mode |= S_IFCHR; - fattr->cf_dtype = DT_CHR; - if (bytes_read == 24) { - /* we have enough to decode dev num */ - __u64 mjr; /* major */ - __u64 mnr; /* minor */ - mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); - mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); - fattr->cf_rdev = MKDEV(mjr, mnr); - } - } else if (memcmp("IntxLNK", pbuf, 7) == 0) { - cifs_dbg(FYI, "Symlink\n"); - fattr->cf_mode |= S_IFLNK; - fattr->cf_dtype = DT_LNK; - } else { - fattr->cf_mode |= S_IFREG; /* file? */ - fattr->cf_dtype = DT_REG; - rc = -EOPNOTSUPP; + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_READ; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc) { + cifs_put_tlink(tlink); + return rc; + } + + /* Read header */ + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = 24; + + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); + if ((rc == 0) && (bytes_read >= 8)) { + if (memcmp("IntxBLK", pbuf, 8) == 0) { + cifs_dbg(FYI, "Block device\n"); + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); } + } else if (memcmp("IntxCHR", pbuf, 8) == 0) { + cifs_dbg(FYI, "Char device\n"); + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); + } + } else if (memcmp("IntxLNK", pbuf, 7) == 0) { + cifs_dbg(FYI, "Symlink\n"); + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; } else { - fattr->cf_mode |= S_IFREG; /* then it is a file */ + fattr->cf_mode |= S_IFREG; /* file? */ fattr->cf_dtype = DT_REG; - rc = -EOPNOTSUPP; /* or some unknown SFU type */ + rc = -EOPNOTSUPP; } - CIFSSMBClose(xid, tcon, netfid); + } else { + fattr->cf_mode |= S_IFREG; /* then it is a file */ + fattr->cf_dtype = DT_REG; + rc = -EOPNOTSUPP; /* or some unknown SFU type */ } + CIFSSMBClose(xid, tcon, fid.netfid); cifs_put_tlink(tlink); return rc; } @@ -517,10 +531,15 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); - rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS", - ea_value, 4 /* size of buf */, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (tcon->ses->server->ops->query_all_EAs == NULL) { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } + + rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path, + "SETFILEBITS", ea_value, 4 /* size of buf */, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); cifs_put_tlink(tlink); if (rc < 0) return (int)rc; @@ -542,7 +561,8 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ static void cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, - struct cifs_sb_info *cifs_sb, bool adjust_tz) + struct cifs_sb_info *cifs_sb, bool adjust_tz, + bool symlink) { struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -569,7 +589,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, fattr->cf_createtime = le64_to_cpu(info->CreationTime); fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); - if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + + if (symlink) { + fattr->cf_mode = S_IFLNK; + fattr->cf_dtype = DT_LNK; + } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; fattr->cf_dtype = DT_DIR; /* @@ -578,10 +602,6 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, */ if (!tcon->unix_ext) fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; - } else if (fattr->cf_cifsattrs & ATTR_REPARSE) { - fattr->cf_mode = S_IFLNK; - fattr->cf_dtype = DT_LNK; - fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); } else { fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; @@ -626,7 +646,8 @@ cifs_get_file_info(struct file *filp) rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); switch (rc) { case 0: - cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); + cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false, + false); break; case -EREMOTE: cifs_create_dfs_fattr(&fattr, inode->i_sb); @@ -660,7 +681,7 @@ cgfi_exit: int cifs_get_inode_info(struct inode **inode, const char *full_path, FILE_ALL_INFO *data, struct super_block *sb, int xid, - const __u16 *fid) + const struct cifs_fid *fid) { bool validinum = false; __u16 srchflgs; @@ -673,6 +694,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, bool adjust_tz = false; struct cifs_fattr fattr; struct cifs_search_info *srchinf = NULL; + bool symlink = false; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -702,12 +724,12 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } data = (FILE_ALL_INFO *)buf; rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, - data, &adjust_tz); + data, &adjust_tz, &symlink); } if (!rc) { - cifs_all_info_to_fattr(&fattr, (FILE_ALL_INFO *)data, cifs_sb, - adjust_tz); + cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz, + symlink); } else if (rc == -EREMOTE) { cifs_create_dfs_fattr(&fattr, sb); rc = 0; @@ -796,9 +818,10 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) - cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } if (!*inode) { @@ -1027,7 +1050,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, { int oplock = 0; int rc; - __u16 netfid; + struct cifs_fid fid; + struct cifs_open_parms oparms; struct inode *inode = dentry->d_inode; struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -1050,10 +1074,16 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, goto out; } - rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, - DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, - &netfid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc != 0) goto out; @@ -1074,7 +1104,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, goto out_close; } info_buf->Attributes = cpu_to_le32(dosattr); - rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid, current->tgid); /* although we would like to mark the file hidden if that fails we will still try to rename it */ @@ -1085,7 +1115,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, } /* rename the file */ - rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls, + rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc != 0) { @@ -1094,8 +1125,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, } /* try to set DELETE_ON_CLOSE */ - if (!cifsInode->delete_pending) { - rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, + if (!test_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags)) { + rc = CIFSSMBSetFileDisposition(xid, tcon, true, fid.netfid, current->tgid); /* * some samba versions return -ENOENT when we try to set the @@ -1111,11 +1142,11 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, rc = -EBUSY; goto undo_rename; } - cifsInode->delete_pending = true; + set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags); } out_close: - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); out: kfree(info_buf); cifs_put_tlink(tlink); @@ -1127,13 +1158,13 @@ out: * them anyway. */ undo_rename: - CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name, + CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); undo_setattr: if (dosattr != origattr) { info_buf->Attributes = cpu_to_le32(origattr); - if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid, current->tgid)) cifsInode->cifsAttrs = origattr; } @@ -1544,7 +1575,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - __u16 srcfid; + struct cifs_fid fid; + struct cifs_open_parms oparms; int oplock, rc; tlink = cifs_sb_tlink(cifs_sb); @@ -1571,17 +1603,23 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; /* open the file to be renamed -- we need DELETE perms */ - rc = CIFSSMBOpen(xid, tcon, from_path, FILE_OPEN, DELETE, - CREATE_NOT_DIR, &srcfid, &oplock, NULL, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.desired_access = DELETE; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = from_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc == 0) { - rc = CIFSSMBRenameOpenFile(xid, tcon, srcfid, + rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, (const char *) to_dentry->d_name.name, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - CIFSSMBClose(xid, tcon, srcfid); + CIFSSMBClose(xid, tcon, fid.netfid); } do_rename_exit: cifs_put_tlink(tlink); @@ -1703,6 +1741,9 @@ cifs_inode_needs_reval(struct inode *inode) if (cifs_i->time == 0) return true; + if (!cifs_sb->actimeo) + return true; + if (!time_in_range(jiffies, cifs_i->time, cifs_i->time + cifs_sb->actimeo)) return true; @@ -1722,23 +1763,62 @@ int cifs_invalidate_mapping(struct inode *inode) { int rc = 0; - struct cifsInodeInfo *cifs_i = CIFS_I(inode); - - cifs_i->invalid_mapping = false; if (inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = invalidate_inode_pages2(inode->i_mapping); - if (rc) { + if (rc) cifs_dbg(VFS, "%s: could not invalidate inode %p\n", __func__, inode); - cifs_i->invalid_mapping = true; - } } cifs_fscache_reset_inode_cookie(inode); return rc; } +/** + * cifs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +static int +cifs_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + freezable_schedule_unsafe(); + return 0; +} + +int +cifs_revalidate_mapping(struct inode *inode) +{ + int rc; + unsigned long *flags = &CIFS_I(inode)->flags; + + rc = wait_on_bit_lock(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, + TASK_KILLABLE); + if (rc) + return rc; + + if (test_and_clear_bit(CIFS_INO_INVALID_MAPPING, flags)) { + rc = cifs_invalidate_mapping(inode); + if (rc) + set_bit(CIFS_INO_INVALID_MAPPING, flags); + } + + clear_bit_unlock(CIFS_INO_LOCK, flags); + smp_mb__after_atomic(); + wake_up_bit(flags, CIFS_INO_LOCK); + + return rc; +} + +int +cifs_zap_mapping(struct inode *inode) +{ + set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags); + return cifs_revalidate_mapping(inode); +} + int cifs_revalidate_file_attr(struct file *filp) { int rc = 0; @@ -1805,9 +1885,7 @@ int cifs_revalidate_file(struct file *filp) if (rc) return rc; - if (CIFS_I(inode)->invalid_mapping) - rc = cifs_invalidate_mapping(inode); - return rc; + return cifs_revalidate_mapping(inode); } /* revalidate a dentry's inode attributes */ @@ -1820,9 +1898,7 @@ int cifs_revalidate_dentry(struct dentry *dentry) if (rc) return rc; - if (CIFS_I(inode)->invalid_mapping) - rc = cifs_invalidate_mapping(inode); - return rc; + return cifs_revalidate_mapping(inode); } int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 3e084558585..45cb59bcc79 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -3,7 +3,7 @@ * * vfs operations that deal with io control * - * Copyright (C) International Business Machines Corp., 2005,2007 + * Copyright (C) International Business Machines Corp., 2005,2013 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -22,25 +22,132 @@ */ #include <linux/fs.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/mm.h> +#include <linux/pagemap.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#define CIFS_IOCTL_MAGIC 0xCF +#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) + +static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, + unsigned long srcfd, u64 off, u64 len, u64 destoff) +{ + int rc; + struct cifsFileInfo *smb_file_target = dst_file->private_data; + struct inode *target_inode = file_inode(dst_file); + struct cifs_tcon *target_tcon; + struct fd src_file; + struct cifsFileInfo *smb_file_src; + struct inode *src_inode; + struct cifs_tcon *src_tcon; + + cifs_dbg(FYI, "ioctl clone range\n"); + /* the destination must be opened for writing */ + if (!(dst_file->f_mode & FMODE_WRITE)) { + cifs_dbg(FYI, "file target not open for write\n"); + return -EINVAL; + } + + /* check if target volume is readonly and take reference */ + rc = mnt_want_write_file(dst_file); + if (rc) { + cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc); + return rc; + } + + src_file = fdget(srcfd); + if (!src_file.file) { + rc = -EBADF; + goto out_drop_write; + } + + if ((!src_file.file->private_data) || (!dst_file->private_data)) { + rc = -EBADF; + cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n"); + goto out_fput; + } + + rc = -EXDEV; + smb_file_target = dst_file->private_data; + smb_file_src = src_file.file->private_data; + src_tcon = tlink_tcon(smb_file_src->tlink); + target_tcon = tlink_tcon(smb_file_target->tlink); + + /* check if source and target are on same tree connection */ + if (src_tcon != target_tcon) { + cifs_dbg(VFS, "file copy src and target on different volume\n"); + goto out_fput; + } + + src_inode = file_inode(src_file.file); + + /* + * Note: cifs case is easier than btrfs since server responsible for + * checks for proper open modes and file type and if it wants + * server could even support copy of range where source = target + */ + + /* so we do not deadlock racing two ioctls on same files */ + if (target_inode < src_inode) { + mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_CHILD); + } + + /* determine range to clone */ + rc = -EINVAL; + if (off + len > src_inode->i_size || off + len < off) + goto out_unlock; + if (len == 0) + len = src_inode->i_size - off; + + cifs_dbg(FYI, "about to flush pages\n"); + /* should we flush first and last page first */ + truncate_inode_pages_range(&target_inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len)-1); + + if (target_tcon->ses->server->ops->clone_range) + rc = target_tcon->ses->server->ops->clone_range(xid, + smb_file_src, smb_file_target, off, len, destoff); + + /* force revalidate of size and timestamps of target file now + that target is updated on the server */ + CIFS_I(target_inode)->time = 0; +out_unlock: + /* although unlocking in the reverse order from locking is not + strictly necessary here it is a little cleaner to be consistent */ + if (target_inode < src_inode) { + mutex_unlock(&src_inode->i_mutex); + mutex_unlock(&target_inode->i_mutex); + } else { + mutex_unlock(&target_inode->i_mutex); + mutex_unlock(&src_inode->i_mutex); + } +out_fput: + fdput(src_file); +out_drop_write: + mnt_drop_write_file(dst_file); + return rc; +} + long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; struct cifs_sb_info *cifs_sb; -#ifdef CONFIG_CIFS_POSIX struct cifsFileInfo *pSMBFile = filep->private_data; struct cifs_tcon *tcon; __u64 ExtAttrBits = 0; - __u64 ExtAttrMask = 0; __u64 caps; -#endif /* CONFIG_CIFS_POSIX */ xid = get_xid(); @@ -49,13 +156,14 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) cifs_sb = CIFS_SB(inode->i_sb); switch (command) { -#ifdef CONFIG_CIFS_POSIX case FS_IOC_GETFLAGS: if (pSMBFile == NULL) break; tcon = tlink_tcon(pSMBFile->tlink); caps = le64_to_cpu(tcon->fsUnixInfo.Capability); +#ifdef CONFIG_CIFS_POSIX if (CIFS_UNIX_EXTATTR_CAP & caps) { + __u64 ExtAttrMask = 0; rc = CIFSGetExtAttr(xid, tcon, pSMBFile->fid.netfid, &ExtAttrBits, &ExtAttrMask); @@ -63,29 +171,53 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE, (int __user *)arg); + if (rc != EOPNOTSUPP) + break; + } +#endif /* CONFIG_CIFS_POSIX */ + rc = 0; + if (CIFS_I(inode)->cifsAttrs & ATTR_COMPRESSED) { + /* add in the compressed bit */ + ExtAttrBits = FS_COMPR_FL; + rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE, + (int __user *)arg); } break; - case FS_IOC_SETFLAGS: if (pSMBFile == NULL) break; tcon = tlink_tcon(pSMBFile->tlink); caps = le64_to_cpu(tcon->fsUnixInfo.Capability); - if (CIFS_UNIX_EXTATTR_CAP & caps) { - if (get_user(ExtAttrBits, (int __user *)arg)) { - rc = -EFAULT; - break; - } - /* - * rc = CIFSGetExtAttr(xid, tcon, - * pSMBFile->fid.netfid, - * extAttrBits, - * &ExtAttrMask); - */ + + if (get_user(ExtAttrBits, (int __user *)arg)) { + rc = -EFAULT; + break; + } + + /* + * if (CIFS_UNIX_EXTATTR_CAP & caps) + * rc = CIFSSetExtAttr(xid, tcon, + * pSMBFile->fid.netfid, + * extAttrBits, + * &ExtAttrMask); + * if (rc != EOPNOTSUPP) + * break; + */ + + /* Currently only flag we can set is compressed flag */ + if ((ExtAttrBits & FS_COMPR_FL) == 0) + break; + + /* Try to set compress flag */ + if (tcon->ses->server->ops->set_compression) { + rc = tcon->ses->server->ops->set_compression( + xid, tcon, pSMBFile); + cifs_dbg(FYI, "set compress flag rc %d\n", rc); } - cifs_dbg(FYI, "set flags not implemented yet\n"); break; -#endif /* CONFIG_CIFS_POSIX */ + case CIFS_IOC_COPYCHUNK_FILE: + rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0); + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index cc0234710dd..68559fd557f 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -29,6 +29,10 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" +/* + * M-F Symlink Functions - Begin + */ + #define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) #define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) #define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1)) @@ -91,10 +95,8 @@ symlink_hash_err: } static int -CIFSParseMFSymlink(const u8 *buf, - unsigned int buf_len, - unsigned int *_link_len, - char **_link_str) +parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, + char **_link_str) { int rc; unsigned int link_len; @@ -137,7 +139,7 @@ CIFSParseMFSymlink(const u8 *buf, } static int -CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) +format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) { int rc; unsigned int link_len; @@ -180,214 +182,114 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) return 0; } +bool +couldbe_mf_symlink(const struct cifs_fattr *fattr) +{ + if (!S_ISREG(fattr->cf_mode)) + /* it's not a symlink */ + return false; + + if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE) + /* it's not a symlink */ + return false; + + return true; +} + static int -CIFSCreateMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, - const char *fromName, const char *toName, - struct cifs_sb_info *cifs_sb) +create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *fromName, + const char *toName) { int rc; - int oplock = 0; - int remap; - int create_options = CREATE_NOT_DIR; - __u16 netfid = 0; u8 *buf; unsigned int bytes_written = 0; - struct cifs_io_parms io_parms; - struct nls_table *nls_codepage; - - nls_codepage = cifs_sb->local_nls; - remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; - rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); - if (rc != 0) { - kfree(buf); - return rc; - } - - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - - rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, - create_options, &netfid, &oplock, NULL, - nls_codepage, remap); - if (rc != 0) { - kfree(buf); - return rc; - } - - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + rc = format_mf_symlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); + if (rc) + goto out; - rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, buf, NULL, 0); - CIFSSMBClose(xid, tcon, netfid); - kfree(buf); - if (rc != 0) - return rc; + rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb, + fromName, buf, &bytes_written); + if (rc) + goto out; if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE) - return -EIO; - - return 0; + rc = -EIO; +out: + kfree(buf); + return rc; } static int -CIFSQueryMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, char **symlinkinfo, - const struct nls_table *nls_codepage, int remap) +query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char **symlinkinfo) { int rc; - int oplock = 0; - __u16 netfid = 0; - u8 *buf; - char *pbuf; - unsigned int bytes_read = 0; - int buf_type = CIFS_NO_BUFFER; + u8 *buf = NULL; unsigned int link_len = 0; - struct cifs_io_parms io_parms; - FILE_ALL_INFO file_info; - - rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, &netfid, &oplock, &file_info, - nls_codepage, remap); - if (rc != 0) - return rc; - - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { - CIFSSMBClose(xid, tcon, netfid); - /* it's not a symlink */ - return -EINVAL; - } + unsigned int bytes_read = 0; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; - pbuf = buf; - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; - - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); - CIFSSMBClose(xid, tcon, netfid); - if (rc != 0) { - kfree(buf); - return rc; - } - - rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo); - kfree(buf); - if (rc != 0) - return rc; - - return 0; -} - -bool -CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr) -{ - if (!(fattr->cf_mode & S_IFREG)) - /* it's not a symlink */ - return false; - - if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE) - /* it's not a symlink */ - return false; - - return true; -} - -int -open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, - unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, - unsigned int xid) -{ - int rc; - int oplock = 0; - __u16 netfid = 0; - struct tcon_link *tlink; - struct cifs_tcon *ptcon; - struct cifs_io_parms io_parms; - int buf_type = CIFS_NO_BUFFER; - FILE_ALL_INFO file_info; - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) - return PTR_ERR(tlink); - ptcon = tlink_tcon(tlink); + if (tcon->ses->server->ops->query_mf_symlink) + rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, + cifs_sb, path, buf, &bytes_read); + else + rc = -ENOSYS; - rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, &netfid, &oplock, &file_info, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != 0) { - cifs_put_tlink(tlink); - return rc; - } + if (rc) + goto out; - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { - CIFSSMBClose(xid, ptcon, netfid); - cifs_put_tlink(tlink); - /* it's not a symlink */ - return rc; + if (bytes_read == 0) { /* not a symlink */ + rc = -EINVAL; + goto out; } - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = ptcon; - io_parms.offset = 0; - io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; - - rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type); - CIFSSMBClose(xid, ptcon, netfid); - cifs_put_tlink(tlink); + rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo); +out: + kfree(buf); return rc; } - int -CIFSCheckMFSymlink(struct cifs_fattr *fattr, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, unsigned int xid) +check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + const unsigned char *path) { - int rc = 0; + int rc; u8 *buf = NULL; unsigned int link_len = 0; unsigned int bytes_read = 0; - struct cifs_tcon *ptcon; - if (!CIFSCouldBeMFSymlink(fattr)) + if (!couldbe_mf_symlink(fattr)) /* it's not a symlink */ return 0; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); - if (!buf) { - rc = -ENOMEM; - goto out; - } + if (!buf) + return -ENOMEM; - ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb)); - if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink)) - rc = ptcon->ses->server->ops->query_mf_symlink(path, buf, - &bytes_read, cifs_sb, xid); + if (tcon->ses->server->ops->query_mf_symlink) + rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, + cifs_sb, path, buf, &bytes_read); else - goto out; + rc = -ENOSYS; - if (rc != 0) + if (rc) goto out; if (bytes_read == 0) /* not a symlink */ goto out; - rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL); + rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL); if (rc == -EINVAL) { /* it's not a symlink */ rc = 0; @@ -407,6 +309,95 @@ out: return rc; } +/* + * SMB 1.0 Protocol specific functions + */ + +int +cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_read) +{ + int rc; + int oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifs_io_parms io_parms; + int buf_type = CIFS_NO_BUFFER; + FILE_ALL_INFO file_info; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_READ; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, &file_info); + if (rc) + return rc; + + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) + /* it's not a symlink */ + goto out; + + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type); +out: + CIFSSMBClose(xid, tcon, fid.netfid); + return rc; +} + +int +cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_written) +{ + int rc; + int oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifs_io_parms io_parms; + int create_options = CREATE_NOT_DIR; + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc) + return rc; + + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0); + CIFSSMBClose(xid, tcon, fid.netfid); + return rc; +} + +/* + * M-F Symlink Functions - End + */ + int cifs_hardlink(struct dentry *old_file, struct inode *inode, struct dentry *direntry) @@ -442,8 +433,10 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, CIFS_MOUNT_MAP_SPECIAL_CHR); else { server = tcon->ses->server; - if (!server->ops->create_hardlink) - return -ENOSYS; + if (!server->ops->create_hardlink) { + rc = -ENOSYS; + goto cifs_hl_exit; + } rc = server->ops->create_hardlink(xid, tcon, from_name, to_name, cifs_sb); if ((rc == -EIO) || (rc == -EINVAL)) @@ -534,15 +527,10 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) * and fallback to UNIX Extensions Symlinks. */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + rc = query_mf_symlink(xid, tcon, cifs_sb, full_path, + &target_path); - if ((rc != 0) && cap_unix(tcon->ses)) - rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, - cifs_sb->local_nls); - else if (rc != 0 && server->ops->query_symlink) + if (rc != 0 && server->ops->query_symlink) rc = server->ops->query_symlink(xid, tcon, full_path, &target_path, cifs_sb); @@ -591,8 +579,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) /* BB what if DFS and this volume is on different share? BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, - cifs_sb); + rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, cifs_sb->local_nls); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 138a011633f..3b0c62e622d 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -278,7 +278,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , } static int -check_smb_hdr(struct smb_hdr *smb, __u16 mid) +check_smb_hdr(struct smb_hdr *smb) { /* does it have the right SMB "signature" ? */ if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) { @@ -287,13 +287,6 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) return 1; } - /* Make sure that message ids match */ - if (mid != smb->Mid) { - cifs_dbg(VFS, "Mids do not match. received=%u expected=%u\n", - smb->Mid, mid); - return 1; - } - /* if it's a response then accept */ if (smb->Flags & SMBFLG_RESPONSE) return 0; @@ -302,7 +295,8 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) if (smb->Command == SMB_COM_LOCKING_ANDX) return 0; - cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", smb->Mid); + cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", + get_mid(smb)); return 1; } @@ -310,7 +304,6 @@ int checkSMB(char *buf, unsigned int total_read) { struct smb_hdr *smb = (struct smb_hdr *)buf; - __u16 mid = smb->Mid; __u32 rfclen = be32_to_cpu(smb->smb_buf_length); __u32 clc_len; /* calculated length */ cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n", @@ -348,7 +341,7 @@ checkSMB(char *buf, unsigned int total_read) } /* otherwise, there is enough to get to the BCC */ - if (check_smb_hdr(smb, mid)) + if (check_smb_hdr(smb)) return -EIO; clc_len = smbCalcSize(smb); @@ -359,6 +352,7 @@ checkSMB(char *buf, unsigned int total_read) } if (4 + rfclen != clc_len) { + __u16 mid = get_mid(smb); /* check if bcc wrapped around for large read responses */ if ((rfclen > 64 * 1024) && (rfclen > clc_len)) { /* check if lengths match mod 64K */ @@ -366,11 +360,11 @@ checkSMB(char *buf, unsigned int total_read) return 0; /* bcc wrapped */ } cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n", - clc_len, 4 + rfclen, smb->Mid); + clc_len, 4 + rfclen, mid); if (4 + rfclen < clc_len) { cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n", - rfclen, smb->Mid); + rfclen, mid); return -EIO; } else if (rfclen > clc_len + 512) { /* @@ -383,7 +377,7 @@ checkSMB(char *buf, unsigned int total_read) * data to 512 bytes. */ cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n", - rfclen, smb->Mid); + rfclen, mid); return -EIO; } } @@ -472,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) cifs_dbg(FYI, "file id match, oplock break\n"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - cifs_set_oplock_level(pCifsInode, - pSMB->OplockLevel ? OPLOCK_READ : 0); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &pCifsInode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (pSMB->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + queue_work(cifsiod_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -557,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->oplock = 0; } +static int +cifs_oplock_break_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +/* + * We wait for oplock breaks to be processed before we attempt to perform + * writes. + */ +int cifs_get_writer(struct cifsInodeInfo *cinode) +{ + int rc; + +start: + rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, + cifs_oplock_break_wait, TASK_KILLABLE); + if (rc) + return rc; + + spin_lock(&cinode->writers_lock); + if (!cinode->writers) + set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + cinode->writers++; + /* Check to see if we have started servicing an oplock break */ + if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) { + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); + goto start; + } + spin_unlock(&cinode->writers_lock); + return 0; +} + +void cifs_put_writer(struct cifsInodeInfo *cinode) +{ + spin_lock(&cinode->writers_lock); + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); +} + +void cifs_done_oplock_break(struct cifsInodeInfo *cinode) +{ + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK); +} + bool backup_cred(struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index af847e1cf1c..6834b9c3bec 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -51,7 +51,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRnoaccess, -EACCES}, {ERRbadfid, -EBADF}, {ERRbadmcb, -EIO}, - {ERRnomem, -ENOMEM}, + {ERRnomem, -EREMOTEIO}, {ERRbadmem, -EFAULT}, {ERRbadenv, -EFAULT}, {ERRbadformat, -EINVAL}, @@ -780,7 +780,9 @@ static const struct { ERRDOS, ERRnoaccess, 0xc0000290}, { ERRDOS, ERRbadfunc, 0xc000029c}, { ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, { - ERRDOS, ERRinvlevel, 0x007c0001}, }; + ERRDOS, ERRinvlevel, 0x007c0001}, { + 0, 0, 0 } +}; /***************************************************************************** Print an error message from the status code @@ -793,8 +795,8 @@ cifs_print_status(__u32 status_code) while (nt_errs[idx].nt_errstr != NULL) { if (((nt_errs[idx].nt_errcode) & 0xFFFFFF) == (status_code & 0xFFFFFF)) { - printk(KERN_NOTICE "Status code returned 0x%08x %s\n", - status_code, nt_errs[idx].nt_errstr); + pr_notice("Status code returned 0x%08x %s\n", + status_code, nt_errs[idx].nt_errstr); } idx++; } @@ -939,8 +941,9 @@ cifs_UnixTimeToNT(struct timespec t) return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET; } -static int total_days_of_prev_months[] = -{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; +static const int total_days_of_prev_months[] = { + 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 +}; struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) { diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 53a75f3d017..b15862e0f68 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -134,22 +134,6 @@ out: dput(dentry); } -/* - * Is it possible that this directory might turn out to be a DFS referral - * once we go to try and use it? - */ -static bool -cifs_dfs_is_possible(struct cifs_sb_info *cifs_sb) -{ -#ifdef CONFIG_CIFS_DFS_UPCALL - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - - if (tcon->Flags & SMB_SHARE_IS_IN_DFS) - return true; -#endif - return false; -} - static void cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) { @@ -159,27 +143,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; fattr->cf_dtype = DT_DIR; - /* - * Windows CIFS servers generally make DFS referrals look - * like directories in FIND_* responses with the reparse - * attribute flag also set (since DFS junctions are - * reparse points). We must revalidate at least these - * directory inodes before trying to use them (if - * they are DFS we will get PATH_NOT_COVERED back - * when queried directly and can then try to connect - * to the DFS target) - */ - if (cifs_dfs_is_possible(cifs_sb) && - (fattr->cf_cifsattrs & ATTR_REPARSE)) - fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; - } else if (fattr->cf_cifsattrs & ATTR_REPARSE) { - fattr->cf_mode = S_IFLNK; - fattr->cf_dtype = DT_LNK; } else { fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; } + /* + * We need to revalidate it further to make a decision about whether it + * is a symbolic link, DFS referral or a reparse point with a direct + * access like junctions, deduplicated files, NFS symlinks. + */ + if (fattr->cf_cifsattrs & ATTR_REPARSE) + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + /* non-unix readdir doesn't provide nlink */ fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; @@ -773,7 +749,7 @@ static int cifs_filldir(char *find_entry, struct file *file, } if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) && - CIFSCouldBeMFSymlink(&fattr)) + couldbe_mf_symlink(&fattr)) /* * trying to get the type and mode can be slow, * so just call those regular files for now, and mark diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 352358de1d7..e87387dbf39 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -500,9 +500,9 @@ select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) return NTLMv2; if (global_secflags & CIFSSEC_MAY_NTLM) return NTLM; - /* Fallthrough */ default: - return Unspecified; + /* Fallthrough to attempt LANMAN authentication next */ + break; } case CIFS_NEGFLAVOR_LANMAN: switch (requested) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 8233b174de3..d1fdfa84870 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -67,7 +67,7 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf, mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n", - in_buf->Mid, rc); + get_mid(in_buf), rc); return rc; } @@ -101,7 +101,7 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer) spin_lock(&GlobalMid_Lock); list_for_each_entry(mid, &server->pending_mid_q, qhead) { - if (mid->mid == buf->Mid && + if (compare_mid(mid->mid, buf) && mid->mid_state == MID_REQUEST_SUBMITTED && le16_to_cpu(mid->command) == buf->Command) { spin_unlock(&GlobalMid_Lock); @@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) return 0; } +static void +cifs_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + cifs_set_oplock_level(cinode, OPLOCK_READ); + else + cifs_set_oplock_level(cinode, 0); +} + static bool cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server, char *buf, int malformed) @@ -534,10 +544,12 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjustTZ) + FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink) { int rc; + *symlink = false; + /* could do find first instead but this returns more info */ rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & @@ -554,6 +566,30 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, CIFS_MOUNT_MAP_SPECIAL_CHR); *adjustTZ = true; } + + if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) { + int tmprc; + int oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.create_options = 0; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + /* Need to check if this is a symbolic link or not */ + tmprc = CIFS_open(xid, &oparms, &oplock, NULL); + if (tmprc == -EOPNOTSUPP) + *symlink = true; + else + CIFSSMBClose(xid, tcon, fid.netfid); + } + return rc; } @@ -686,12 +722,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, oparms->cifs_sb->local_nls, oparms->cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - return CIFSSMBOpen(xid, oparms->tcon, oparms->path, - oparms->disposition, oparms->desired_access, - oparms->create_options, &oparms->fid->netfid, oplock, - buf, oparms->cifs_sb->local_nls, - oparms->cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + return CIFS_open(xid, oparms, oplock, buf); } static void @@ -742,8 +773,9 @@ smb_set_file_info(struct inode *inode, const char *full_path, { int oplock = 0; int rc; - __u16 netfid; __u32 netpid; + struct cifs_fid fid; + struct cifs_open_parms oparms; struct cifsFileInfo *open_file; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -753,7 +785,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, /* if the file is already open for write, just use that fileid */ open_file = find_writable_file(cinode, true); if (open_file) { - netfid = open_file->fid.netfid; + fid.netfid = open_file->fid.netfid; netpid = open_file->pid; tcon = tlink_tcon(open_file->tlink); goto set_via_filehandle; @@ -777,12 +809,17 @@ smb_set_file_info(struct inode *inode, const char *full_path, goto out; } - cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); - rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, - SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, - &netfid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc != 0) { if (rc == -EIO) rc = -EINVAL; @@ -792,12 +829,12 @@ smb_set_file_info(struct inode *inode, const char *full_path, netpid = current->tgid; set_via_filehandle: - rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid); + rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid); if (!rc) cinode->cifsAttrs = le32_to_cpu(buf->Attributes); if (open_file == NULL) - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); else cifsFileInfo_put(open_file); out: @@ -807,6 +844,13 @@ out: } static int +cifs_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile) +{ + return CIFSSMB_set_compression(xid, tcon, cfile->fid.netfid); +} + +static int cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, struct cifs_fid *fid, __u16 search_flags, @@ -882,33 +926,80 @@ cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset, } static int +cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, char **symlinkinfo, + const struct nls_table *nls_codepage) +{ +#ifdef CONFIG_CIFS_DFS_UPCALL + int rc; + unsigned int num_referrals = 0; + struct dfs_info3_param *referrals = NULL; + + rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage, + &num_referrals, &referrals, 0); + + if (!rc && num_referrals > 0) { + *symlinkinfo = kstrndup(referrals->node_name, + strlen(referrals->node_name), + GFP_KERNEL); + if (!*symlinkinfo) + rc = -ENOMEM; + free_dfs_info_array(referrals, num_referrals); + } + return rc; +#else /* No DFS support */ + return -EREMOTE; +#endif +} + +static int cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, char **target_path, struct cifs_sb_info *cifs_sb) { int rc; int oplock = 0; - __u16 netfid; + struct cifs_fid fid; + struct cifs_open_parms oparms; cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); - rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, - FILE_READ_ATTRIBUTES, OPEN_REPARSE_POINT, &netfid, - &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + /* Check for unix extensions */ + if (cap_unix(tcon->ses)) { + rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, + cifs_sb->local_nls); + if (rc == -EREMOTE) + rc = cifs_unix_dfs_readlink(xid, tcon, full_path, + target_path, + cifs_sb->local_nls); + + goto out; + } + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.create_options = OPEN_REPARSE_POINT; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) - return rc; + goto out; - rc = CIFSSMBQuerySymLink(xid, tcon, netfid, target_path, + rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path, cifs_sb->local_nls); - if (rc) { - CIFSSMBClose(xid, tcon, netfid); - return rc; - } + if (rc) + goto out_close; convert_delimiter(*target_path, '/'); - CIFSSMBClose(xid, tcon, netfid); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); +out_close: + CIFSSMBClose(xid, tcon, fid.netfid); +out: + if (!rc) + cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); return rc; } @@ -938,6 +1029,7 @@ struct smb_version_operations smb1_operations = { .clear_stats = cifs_clear_stats, .print_stats = cifs_print_stats, .is_oplock_break = is_valid_oplock_break, + .downgrade_oplock = cifs_downgrade_oplock, .check_trans2 = cifs_check_trans2, .need_neg = cifs_need_neg, .negotiate = cifs_negotiate, @@ -956,6 +1048,7 @@ struct smb_version_operations smb1_operations = { .set_path_size = CIFSSMBSetEOF, .set_file_size = CIFSSMBSetFileSize, .set_file_info = smb_set_file_info, + .set_compression = cifs_set_compression, .echo = CIFSSMBEcho, .mkdir = CIFSSMBMkDir, .mkdir_setinfo = cifs_mkdir_setinfo, @@ -982,8 +1075,18 @@ struct smb_version_operations smb1_operations = { .mand_lock = cifs_mand_lock, .mand_unlock_range = cifs_unlock_range, .push_mand_locks = cifs_push_mandatory_locks, - .query_mf_symlink = open_query_close_cifs_symlink, + .query_mf_symlink = cifs_query_mf_symlink, + .create_mf_symlink = cifs_create_mf_symlink, .is_read_op = cifs_is_read_op, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = CIFSSMBQAllEAs, + .set_EA = CIFSSMBSetEA, +#endif /* CIFS_XATTR */ +#ifdef CONFIG_CIFS_ACL + .get_acl = get_cifs_acl, + .get_acl_by_fid = get_cifs_acl_by_fid, + .set_acl = set_cifs_acl, +#endif /* CIFS_ACL */ }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index c38350851b0..bc0bb9c34f7 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -57,4 +57,7 @@ #define SMB2_CMACAES_SIZE (16) #define SMB3_SIGNKEY_SIZE (16) +/* Maximum buffer size value we can send with 1 credit */ +#define SMB2_MAX_BUFFER_SIZE 65536 + #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 78ff88c467b..84c012a6aba 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -123,12 +123,13 @@ move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src) int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjust_tz) + FILE_ALL_INFO *data, bool *adjust_tz, bool *symlink) { int rc; struct smb2_file_all_info *smb2_data; *adjust_tz = false; + *symlink = false; smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, GFP_KERNEL); @@ -136,9 +137,16 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, return -ENOMEM; rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - OPEN_REPARSE_POINT, smb2_data, - SMB2_OP_QUERY_INFO); + FILE_READ_ATTRIBUTES, FILE_OPEN, 0, + smb2_data, SMB2_OP_QUERY_INFO); + if (rc == -EOPNOTSUPP) { + *symlink = true; + /* Failed on a symbolic link - query a reparse point info */ + rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + OPEN_REPARSE_POINT, smb2_data, + SMB2_OP_QUERY_INFO); + } if (rc) goto out; diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 7c2f45c06fc..94bd4fbb13d 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -306,7 +306,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_NONEXISTENT_SECTOR, -EIO, "STATUS_NONEXISTENT_SECTOR"}, {STATUS_MORE_PROCESSING_REQUIRED, -EIO, "STATUS_MORE_PROCESSING_REQUIRED"}, - {STATUS_NO_MEMORY, -ENOMEM, "STATUS_NO_MEMORY"}, + {STATUS_NO_MEMORY, -EREMOTEIO, "STATUS_NO_MEMORY"}, {STATUS_CONFLICTING_ADDRESSES, -EADDRINUSE, "STATUS_CONFLICTING_ADDRESSES"}, {STATUS_NOT_MAPPED_VIEW, -EIO, "STATUS_NOT_MAPPED_VIEW"}, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index fb3966265b6..b8021fde987 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) else cfile->oplock_break_cancelled = false; - server->ops->set_oplock_level(cinode, - rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0, - 0, NULL); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &cinode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (rsp->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); queue_work(cifsiod_wq, &cfile->oplock_break); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 861b3321414..787844bde38 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -182,11 +182,8 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified wsize, or default */ wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); - /* - * limit write size to 2 ** 16, because we don't support multicredit - * requests now. - */ - wsize = min_t(unsigned int, wsize, 2 << 15); + /* set it to the maximum buffer size value we can send with 1 credit */ + wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); return wsize; } @@ -200,15 +197,100 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified rsize, or default */ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); - /* - * limit write size to 2 ** 16, because we don't support multicredit - * requests now. - */ - rsize = min_t(unsigned int, rsize, 2 << 15); + /* set it to the maximum buffer size value we can send with 1 credit */ + rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); return rsize; } +#ifdef CONFIG_CIFS_STATS2 +static int +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc; + unsigned int ret_data_len = 0; + struct network_interface_info_ioctl_rsp *out_buf; + + rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, + FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, + NULL /* no data input */, 0 /* no data input */, + (char **)&out_buf, &ret_data_len); + + if ((rc == 0) && (ret_data_len > 0)) { + /* Dump info on first interface */ + cifs_dbg(FYI, "Adapter Capability 0x%x\t", + le32_to_cpu(out_buf->Capability)); + cifs_dbg(FYI, "Link Speed %lld\n", + le64_to_cpu(out_buf->LinkSpeed)); + } else + cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc); + + return rc; +} +#endif /* STATS2 */ + +static void +smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc; + __le16 srch_path = 0; /* Null - open root of share */ + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + if (rc) + return; + +#ifdef CONFIG_CIFS_STATS2 + SMB3_request_interfaces(xid, tcon); +#endif /* STATS2 */ + + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_ATTRIBUTE_INFORMATION); + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_DEVICE_INFORMATION); + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */ + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + return; +} + +static void +smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc; + __le16 srch_path = 0; /* Null - open root of share */ + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + if (rc) + return; + + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_ATTRIBUTE_INFORMATION); + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_DEVICE_INFORMATION); + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + return; +} + static int smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) @@ -304,7 +386,19 @@ smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon) seq_puts(m, " ASYMMETRIC,"); if (tcon->capabilities == 0) seq_puts(m, " None"); + if (tcon->ss_flags & SSINFO_FLAGS_ALIGNED_DEVICE) + seq_puts(m, " Aligned,"); + if (tcon->ss_flags & SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE) + seq_puts(m, " Partition Aligned,"); + if (tcon->ss_flags & SSINFO_FLAGS_NO_SEEK_PENALTY) + seq_puts(m, " SSD,"); + if (tcon->ss_flags & SSINFO_FLAGS_TRIM_ENABLED) + seq_puts(m, " TRIM-support,"); + seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags); + if (tcon->perf_sector_size) + seq_printf(m, "\tOptimal sector size: 0x%x", + tcon->perf_sector_size); } static void @@ -394,6 +488,157 @@ smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon, } static int +SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct copychunk_ioctl *pcchunk) +{ + int rc; + unsigned int ret_data_len; + struct resume_key_req *res_key; + + rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, + FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */, + NULL, 0 /* no input */, + (char **)&res_key, &ret_data_len); + + if (rc) { + cifs_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc); + goto req_res_key_exit; + } + if (ret_data_len < sizeof(struct resume_key_req)) { + cifs_dbg(VFS, "Invalid refcopy resume key length\n"); + rc = -EINVAL; + goto req_res_key_exit; + } + memcpy(pcchunk->SourceKey, res_key->ResumeKey, COPY_CHUNK_RES_KEY_SIZE); + +req_res_key_exit: + kfree(res_key); + return rc; +} + +static int +smb2_clone_range(const unsigned int xid, + struct cifsFileInfo *srcfile, + struct cifsFileInfo *trgtfile, u64 src_off, + u64 len, u64 dest_off) +{ + int rc; + unsigned int ret_data_len; + struct copychunk_ioctl *pcchunk; + struct copychunk_ioctl_rsp *retbuf = NULL; + struct cifs_tcon *tcon; + int chunks_copied = 0; + bool chunk_sizes_updated = false; + + pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); + + if (pcchunk == NULL) + return -ENOMEM; + + cifs_dbg(FYI, "in smb2_clone_range - about to call request res key\n"); + /* Request a key from the server to identify the source of the copy */ + rc = SMB2_request_res_key(xid, tlink_tcon(srcfile->tlink), + srcfile->fid.persistent_fid, + srcfile->fid.volatile_fid, pcchunk); + + /* Note: request_res_key sets res_key null only if rc !=0 */ + if (rc) + goto cchunk_out; + + /* For now array only one chunk long, will make more flexible later */ + pcchunk->ChunkCount = __constant_cpu_to_le32(1); + pcchunk->Reserved = 0; + pcchunk->Reserved2 = 0; + + tcon = tlink_tcon(trgtfile->tlink); + + while (len > 0) { + pcchunk->SourceOffset = cpu_to_le64(src_off); + pcchunk->TargetOffset = cpu_to_le64(dest_off); + pcchunk->Length = + cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk)); + + /* Request server copy to target from src identified by key */ + rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, + trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, + true /* is_fsctl */, (char *)pcchunk, + sizeof(struct copychunk_ioctl), (char **)&retbuf, + &ret_data_len); + if (rc == 0) { + if (ret_data_len != + sizeof(struct copychunk_ioctl_rsp)) { + cifs_dbg(VFS, "invalid cchunk response size\n"); + rc = -EIO; + goto cchunk_out; + } + if (retbuf->TotalBytesWritten == 0) { + cifs_dbg(FYI, "no bytes copied\n"); + rc = -EIO; + goto cchunk_out; + } + /* + * Check if server claimed to write more than we asked + */ + if (le32_to_cpu(retbuf->TotalBytesWritten) > + le32_to_cpu(pcchunk->Length)) { + cifs_dbg(VFS, "invalid copy chunk response\n"); + rc = -EIO; + goto cchunk_out; + } + if (le32_to_cpu(retbuf->ChunksWritten) != 1) { + cifs_dbg(VFS, "invalid num chunks written\n"); + rc = -EIO; + goto cchunk_out; + } + chunks_copied++; + + src_off += le32_to_cpu(retbuf->TotalBytesWritten); + dest_off += le32_to_cpu(retbuf->TotalBytesWritten); + len -= le32_to_cpu(retbuf->TotalBytesWritten); + + cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %d\n", + le32_to_cpu(retbuf->ChunksWritten), + le32_to_cpu(retbuf->ChunkBytesWritten), + le32_to_cpu(retbuf->TotalBytesWritten)); + } else if (rc == -EINVAL) { + if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) + goto cchunk_out; + + cifs_dbg(FYI, "MaxChunks %d BytesChunk %d MaxCopy %d\n", + le32_to_cpu(retbuf->ChunksWritten), + le32_to_cpu(retbuf->ChunkBytesWritten), + le32_to_cpu(retbuf->TotalBytesWritten)); + + /* + * Check if this is the first request using these sizes, + * (ie check if copy succeed once with original sizes + * and check if the server gave us different sizes after + * we already updated max sizes on previous request). + * if not then why is the server returning an error now + */ + if ((chunks_copied != 0) || chunk_sizes_updated) + goto cchunk_out; + + /* Check that server is not asking us to grow size */ + if (le32_to_cpu(retbuf->ChunkBytesWritten) < + tcon->max_bytes_chunk) + tcon->max_bytes_chunk = + le32_to_cpu(retbuf->ChunkBytesWritten); + else + goto cchunk_out; /* server gave us bogus size */ + + /* No need to change MaxChunks since already set to 1 */ + chunk_sizes_updated = true; + } + } + +cchunk_out: + kfree(pcchunk); + return rc; +} + +static int smb2_flush_file(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *fid) { @@ -446,6 +691,14 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, } static int +smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile) +{ + return SMB2_set_compression(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid); +} + +static int smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, struct cifs_fid *fid, __u16 search_flags, @@ -652,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, } static void +smb2_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II, + 0, NULL); + else + server->ops->set_oplock_level(cinode, 0, 0, NULL); +} + +static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { @@ -783,6 +1047,7 @@ smb2_create_lease_buf(u8 *lease_key, u8 oplock) buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_lease, Name)); buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_REQUEST_LEASE is "RqLs" */ buf->Name[0] = 'R'; buf->Name[1] = 'q'; buf->Name[2] = 'L'; @@ -809,6 +1074,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_lease_v2, Name)); buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_REQUEST_LEASE is "RqLs" */ buf->Name[0] = 'R'; buf->Name[1] = 'q'; buf->Name[2] = 'L'; @@ -857,6 +1123,7 @@ struct smb_version_operations smb20_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -865,6 +1132,7 @@ struct smb_version_operations smb20_operations = { .logoff = SMB2_logoff, .tree_connect = SMB2_tcon, .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb2_qfs_tcon, .is_path_accessible = smb2_is_path_accessible, .can_echo = smb2_can_echo, .echo = SMB2_echo, @@ -874,6 +1142,7 @@ struct smb_version_operations smb20_operations = { .set_path_size = smb2_set_path_size, .set_file_size = smb2_set_file_size, .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, .rmdir = smb2_rmdir, @@ -907,6 +1176,7 @@ struct smb_version_operations smb20_operations = { .set_oplock_level = smb2_set_oplock_level, .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, + .clone_range = smb2_clone_range, }; struct smb_version_operations smb21_operations = { @@ -928,6 +1198,7 @@ struct smb_version_operations smb21_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -936,6 +1207,7 @@ struct smb_version_operations smb21_operations = { .logoff = SMB2_logoff, .tree_connect = SMB2_tcon, .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb2_qfs_tcon, .is_path_accessible = smb2_is_path_accessible, .can_echo = smb2_can_echo, .echo = SMB2_echo, @@ -945,6 +1217,7 @@ struct smb_version_operations smb21_operations = { .set_path_size = smb2_set_path_size, .set_file_size = smb2_set_file_size, .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, .rmdir = smb2_rmdir, @@ -978,6 +1251,7 @@ struct smb_version_operations smb21_operations = { .set_oplock_level = smb21_set_oplock_level, .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, + .clone_range = smb2_clone_range, }; struct smb_version_operations smb30_operations = { @@ -1000,6 +1274,7 @@ struct smb_version_operations smb30_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1008,6 +1283,7 @@ struct smb_version_operations smb30_operations = { .logoff = SMB2_logoff, .tree_connect = SMB2_tcon, .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb3_qfs_tcon, .is_path_accessible = smb2_is_path_accessible, .can_echo = smb2_can_echo, .echo = SMB2_echo, @@ -1017,6 +1293,7 @@ struct smb_version_operations smb30_operations = { .set_path_size = smb2_set_path_size, .set_file_size = smb2_set_file_size, .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, .rmdir = smb2_rmdir, @@ -1051,6 +1328,8 @@ struct smb_version_operations smb30_operations = { .set_oplock_level = smb3_set_oplock_level, .create_lease_buf = smb3_create_lease_buf, .parse_lease_buf = smb3_parse_lease_buf, + .clone_range = smb2_clone_range, + .validate_negotiate = smb3_validate_negotiate, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index eba0efde66d..b0b260dbb19 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -375,7 +375,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities); - memcpy(req->ClientGUID, cifs_client_guid, SMB2_CLIENT_GUID_SIZE); + /* ClientGUID must be zero for SMB2.02 dialect */ + if (ses->server->vals->protocol_id == SMB20_PROT_ID) + memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE); + else + memcpy(req->ClientGUID, server->client_guid, + SMB2_CLIENT_GUID_SIZE); iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field */ @@ -413,7 +418,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* SMB2 only has an extended negflavor */ server->negflavor = CIFS_NEGFLAVOR_EXTENDED; - server->maxBuf = le32_to_cpu(rsp->MaxTransactSize); + /* set it to the maximum buffer size value we can send with 1 credit */ + server->maxBuf = min_t(unsigned int, le32_to_cpu(rsp->MaxTransactSize), + SMB2_MAX_BUFFER_SIZE); server->max_read = le32_to_cpu(rsp->MaxReadSize); server->max_write = le32_to_cpu(rsp->MaxWriteSize); /* BB Do we need to validate the SecurityMode? */ @@ -454,6 +461,82 @@ neg_exit: return rc; } +int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc = 0; + struct validate_negotiate_info_req vneg_inbuf; + struct validate_negotiate_info_rsp *pneg_rsp; + u32 rsplen; + + cifs_dbg(FYI, "validate negotiate\n"); + + /* + * validation ioctl must be signed, so no point sending this if we + * can not sign it. We could eventually change this to selectively + * sign just this, the first and only signed request on a connection. + * This is good enough for now since a user who wants better security + * would also enable signing on the mount. Having validation of + * negotiate info for signed connections helps reduce attack vectors + */ + if (tcon->ses->server->sign == false) + return 0; /* validation requires signing */ + + vneg_inbuf.Capabilities = + cpu_to_le32(tcon->ses->server->vals->req_capabilities); + memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid, + SMB2_CLIENT_GUID_SIZE); + + if (tcon->ses->sign) + vneg_inbuf.SecurityMode = + cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED); + else if (global_secflags & CIFSSEC_MAY_SIGN) + vneg_inbuf.SecurityMode = + cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED); + else + vneg_inbuf.SecurityMode = 0; + + vneg_inbuf.DialectCount = cpu_to_le16(1); + vneg_inbuf.Dialects[0] = + cpu_to_le16(tcon->ses->server->vals->protocol_id); + + rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, + FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, + (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req), + (char **)&pneg_rsp, &rsplen); + + if (rc != 0) { + cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); + return -EIO; + } + + if (rsplen != sizeof(struct validate_negotiate_info_rsp)) { + cifs_dbg(VFS, "invalid size of protocol negotiate response\n"); + return -EIO; + } + + /* check validate negotiate info response matches what we got earlier */ + if (pneg_rsp->Dialect != + cpu_to_le16(tcon->ses->server->vals->protocol_id)) + goto vneg_out; + + if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode)) + goto vneg_out; + + /* do not validate server guid because not saved at negprot time yet */ + + if ((le32_to_cpu(pneg_rsp->Capabilities) | SMB2_NT_FIND | + SMB2_LARGE_FILES) != tcon->ses->server->capabilities) + goto vneg_out; + + /* validate negotiate successful */ + cifs_dbg(FYI, "validate negotiate info successful\n"); + return 0; + +vneg_out: + cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n"); + return -EIO; +} + int SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp) @@ -630,6 +713,8 @@ ssetup_ntlmssp_authenticate: goto ssetup_exit; ses->session_flags = le16_to_cpu(rsp->SessionFlags); + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); ssetup_exit: free_rsp_buf(resp_buftype, rsp); @@ -687,6 +772,10 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) else return -EIO; + /* no need to send SMB logoff if uid already closed due to reconnect */ + if (ses->need_reconnect) + goto smb2_session_already_dead; + rc = small_smb2_init(SMB2_LOGOFF, NULL, (void **) &req); if (rc) return rc; @@ -701,6 +790,8 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) * No tcon so can't do * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); */ + +smb2_session_already_dead: return rc; } @@ -711,6 +802,14 @@ static inline void cifs_stats_fail_inc(struct cifs_tcon *tcon, uint16_t code) #define MAX_SHARENAME_LENGTH (255 /* server */ + 80 /* share */ + 1 /* NULL */) +/* These are similar values to what Windows uses */ +static inline void init_copy_chunk_defaults(struct cifs_tcon *tcon) +{ + tcon->max_chunks = 256; + tcon->max_bytes_chunk = 1048576; + tcon->max_bytes_copy = 16777216; +} + int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, struct cifs_tcon *tcon, const struct nls_table *cp) @@ -812,7 +911,9 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) cifs_dbg(VFS, "DFS capability contradicts DFS flag\n"); - + init_copy_chunk_defaults(tcon); + if (tcon->ses->server->ops->validate_negotiate) + rc = tcon->ses->server->ops->validate_negotiate(xid, tcon); tcon_exit: free_rsp_buf(resp_buftype, rsp); kfree(unc_path); @@ -871,6 +972,7 @@ create_durable_buf(void) buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_durable, Name)); buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DHnQ" */ buf->Name[0] = 'D'; buf->Name[1] = 'H'; buf->Name[2] = 'n'; @@ -895,6 +997,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid) buf->ccontext.NameLength = cpu_to_le16(4); buf->Data.Fid.PersistentFileId = fid->persistent_fid; buf->Data.Fid.VolatileFileId = fid->volatile_fid; + /* SMB2_CREATE_DURABLE_HANDLE_RECONNECT is "DHnC" */ buf->Name[0] = 'D'; buf->Name[1] = 'H'; buf->Name[2] = 'n'; @@ -994,6 +1097,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, int rc = 0; unsigned int num_iovecs = 2; __u32 file_attributes = 0; + char *dhc_buf = NULL, *lc_buf = NULL; cifs_dbg(FYI, "create/open\n"); @@ -1060,6 +1164,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, kfree(copy_path); return rc; } + lc_buf = iov[num_iovecs-1].iov_base; } if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) { @@ -1074,9 +1179,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc) { cifs_small_buf_release(req); kfree(copy_path); - kfree(iov[num_iovecs-1].iov_base); + kfree(lc_buf); return rc; } + dhc_buf = iov[num_iovecs-1].iov_base; } rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); @@ -1108,6 +1214,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, *oplock = rsp->OplockLevel; creat_exit: kfree(copy_path); + kfree(lc_buf); + kfree(dhc_buf); free_rsp_buf(resp_buftype, rsp); return rc; } @@ -1131,6 +1239,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, cifs_dbg(FYI, "SMB2 IOCTL\n"); + *out_data = NULL; /* zero out returned data len, in case of error */ if (plen) *plen = 0; @@ -1176,19 +1285,38 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, req->Flags = 0; iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; - if (indatalen) - inc_rfc1001_len(req, indatalen); + /* + * If no input data, the size of ioctl struct in + * protocol spec still includes a 1 byte data buffer, + * but if input data passed to ioctl, we do not + * want to double count this, so we do not send + * the dummy one byte of data in iovec[0] if sending + * input data (in iovec[1]). We also must add 4 bytes + * in first iovec to allow for rfc1002 length field. + */ + + if (indatalen) { + iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + inc_rfc1001_len(req, indatalen - 1); + } else + iov[0].iov_len = get_rfc1002_length(req) + 4; + rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base; - if (rc != 0) { + if ((rc != 0) && (rc != -EINVAL)) { if (tcon) cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); goto ioctl_exit; + } else if (rc == -EINVAL) { + if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) && + (opcode != FSCTL_SRV_COPYCHUNK)) { + if (tcon) + cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); + goto ioctl_exit; + } } /* check if caller wants to look at return data or just return rc */ @@ -1228,6 +1356,31 @@ ioctl_exit: return rc; } +/* + * Individual callers to ioctl worker function follow + */ + +int +SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid) +{ + int rc; + struct compress_ioctl fsctl_input; + char *ret_data = NULL; + + fsctl_input.CompressionState = + __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + + rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, + FSCTL_SET_COMPRESSION, true /* is_fsctl */, + (char *)&fsctl_input /* data input */, + 2 /* in data len */, &ret_data /* out data */, NULL); + + cifs_dbg(FYI, "set compression rc %d\n", rc); + + return rc; +} + int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) @@ -1750,7 +1903,8 @@ smb2_writev_callback(struct mid_q_entry *mid) /* smb2_async_writev - send an async write, and set up mid to handle result */ int -smb2_async_writev(struct cifs_writedata *wdata) +smb2_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)) { int rc = -EACCES; struct smb2_write_req *req = NULL; @@ -1798,7 +1952,7 @@ smb2_async_writev(struct cifs_writedata *wdata) smb2_writev_callback, wdata, 0); if (rc) { - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); } @@ -2098,11 +2252,9 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0); rsp = (struct smb2_set_info_rsp *)iov[0].iov_base; - if (rc != 0) { + if (rc != 0) cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE); - goto out; - } -out: + free_rsp_buf(resp_buftype, rsp); kfree(iov); return rc; @@ -2293,7 +2445,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); - goto qinf_exit; + goto qfsinf_exit; } rsp = (struct smb2_query_info_rsp *)iov.iov_base; @@ -2305,7 +2457,70 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, if (!rc) copy_fs_info_to_kstatfs(info, fsdata); -qinf_exit: +qfsinf_exit: + free_rsp_buf(resp_buftype, iov.iov_base); + return rc; +} + +int +SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int level) +{ + struct smb2_query_info_rsp *rsp = NULL; + struct kvec iov; + int rc = 0; + int resp_buftype, max_len, min_len; + struct cifs_ses *ses = tcon->ses; + unsigned int rsp_len, offset; + + if (level == FS_DEVICE_INFORMATION) { + max_len = sizeof(FILE_SYSTEM_DEVICE_INFO); + min_len = sizeof(FILE_SYSTEM_DEVICE_INFO); + } else if (level == FS_ATTRIBUTE_INFORMATION) { + max_len = sizeof(FILE_SYSTEM_ATTRIBUTE_INFO); + min_len = MIN_FS_ATTR_INFO_SIZE; + } else if (level == FS_SECTOR_SIZE_INFORMATION) { + max_len = sizeof(struct smb3_fs_ss_info); + min_len = sizeof(struct smb3_fs_ss_info); + } else { + cifs_dbg(FYI, "Invalid qfsinfo level %d\n", level); + return -EINVAL; + } + + rc = build_qfs_info_req(&iov, tcon, level, max_len, + persistent_fid, volatile_fid); + if (rc) + return rc; + + rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0); + if (rc) { + cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); + goto qfsattr_exit; + } + rsp = (struct smb2_query_info_rsp *)iov.iov_base; + + rsp_len = le32_to_cpu(rsp->OutputBufferLength); + offset = le16_to_cpu(rsp->OutputBufferOffset); + rc = validate_buf(offset, rsp_len, &rsp->hdr, min_len); + if (rc) + goto qfsattr_exit; + + if (level == FS_ATTRIBUTE_INFORMATION) + memcpy(&tcon->fsAttrInfo, 4 /* RFC1001 len */ + offset + + (char *)&rsp->hdr, min_t(unsigned int, + rsp_len, max_len)); + else if (level == FS_DEVICE_INFORMATION) + memcpy(&tcon->fsDevInfo, 4 /* RFC1001 len */ + offset + + (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO)); + else if (level == FS_SECTOR_SIZE_INFORMATION) { + struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *) + (4 /* RFC1001 len */ + offset + (char *)&rsp->hdr); + tcon->ss_flags = le32_to_cpu(ss_info->Flags); + tcon->perf_sector_size = + le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf); + } + +qfsattr_exit: free_rsp_buf(resp_buftype, iov.iov_base); return rc; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index b83d0118a75..69f3595d395 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -122,6 +122,23 @@ struct smb2_pdu { __le16 StructureSize2; /* size of wct area (varies, request specific) */ } __packed; +struct smb2_transform_hdr { + __be32 smb2_buf_length; /* big endian on wire */ + /* length is only two or three bytes - with + one or two byte type preceding it that MBZ */ + __u8 ProtocolId[4]; /* 0xFD 'S' 'M' 'B' */ + __u8 Signature[16]; + __u8 Nonce[11]; + __u8 Reserved[5]; + __le32 OriginalMessageSize; + __u16 Reserved1; + __le16 EncryptionAlgorithm; + __u64 SessionId; +} __packed; + +/* Encryption Algorithms */ +#define SMB2_ENCRYPTION_AES128_CCM __constant_cpu_to_le16(0x0001) + /* * SMB2 flag definitions */ @@ -166,8 +183,6 @@ struct smb2_symlink_err_rsp { #define SMB2_CLIENT_GUID_SIZE 16 -extern __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; - struct smb2_negotiate_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 36 */ @@ -237,6 +252,7 @@ struct smb2_sess_setup_req { /* Currently defined SessionFlags */ #define SMB2_SESSION_FLAG_IS_GUEST 0x0001 #define SMB2_SESSION_FLAG_IS_NULL 0x0002 +#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004 struct smb2_sess_setup_rsp { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ @@ -419,11 +435,15 @@ struct smb2_tree_disconnect_rsp { #define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */ #define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ" #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC" -#define SMB2_CREATE_ALLOCATION_SIZE "AlSi" +#define SMB2_CREATE_ALLOCATION_SIZE "AISi" #define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc" #define SMB2_CREATE_TIMEWARP_REQUEST "TWrp" #define SMB2_CREATE_QUERY_ON_DISK_ID "QFid" #define SMB2_CREATE_REQUEST_LEASE "RqLs" +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" +#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74 +#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9 struct smb2_create_req { struct smb2_hdr hdr; @@ -534,9 +554,16 @@ struct create_durable { } Data; } __packed; +#define COPY_CHUNK_RES_KEY_SIZE 24 +struct resume_key_req { + char ResumeKey[COPY_CHUNK_RES_KEY_SIZE]; + __le32 ContextLength; /* MBZ */ + char Context[0]; /* ignored, Windows sets to 4 bytes of zero */ +} __packed; + /* this goes in the ioctl buffer when doing a copychunk request */ struct copychunk_ioctl { - char SourceKey[24]; + char SourceKey[COPY_CHUNK_RES_KEY_SIZE]; __le32 ChunkCount; /* we are only sending 1 */ __le32 Reserved; /* array will only be one chunk long for us */ @@ -546,13 +573,25 @@ struct copychunk_ioctl { __u32 Reserved2; } __packed; -/* Response and Request are the same format */ -struct validate_negotiate_info { +struct copychunk_ioctl_rsp { + __le32 ChunksWritten; + __le32 ChunkBytesWritten; + __le32 TotalBytesWritten; +} __packed; + +struct validate_negotiate_info_req { __le32 Capabilities; __u8 Guid[SMB2_CLIENT_GUID_SIZE]; __le16 SecurityMode; __le16 DialectCount; - __le16 Dialect[1]; + __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */ +} __packed; + +struct validate_negotiate_info_rsp { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 Dialect; /* Dialect in use for the connection */ } __packed; #define RSS_CAPABLE 0x00000001 @@ -569,6 +608,10 @@ struct network_interface_info_ioctl_rsp { #define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */ +struct compress_ioctl { + __le16 CompressionState; /* See cifspdu.h for possible flag values */ +} __packed; + struct smb2_ioctl_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ @@ -584,7 +627,7 @@ struct smb2_ioctl_req { __le32 MaxOutputResponse; __le32 Flags; __u32 Reserved2; - char Buffer[0]; + __u8 Buffer[0]; } __packed; struct smb2_ioctl_rsp { @@ -870,14 +913,16 @@ struct smb2_lease_ack { /* File System Information Classes */ #define FS_VOLUME_INFORMATION 1 /* Query */ -#define FS_LABEL_INFORMATION 2 /* Set */ +#define FS_LABEL_INFORMATION 2 /* Local only */ #define FS_SIZE_INFORMATION 3 /* Query */ #define FS_DEVICE_INFORMATION 4 /* Query */ #define FS_ATTRIBUTE_INFORMATION 5 /* Query */ #define FS_CONTROL_INFORMATION 6 /* Query, Set */ #define FS_FULL_SIZE_INFORMATION 7 /* Query */ #define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */ -#define FS_DRIVER_PATH_INFORMATION 9 /* Query */ +#define FS_DRIVER_PATH_INFORMATION 9 /* Local only */ +#define FS_VOLUME_FLAGS_INFORMATION 10 /* Local only */ +#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */ struct smb2_fs_full_size_info { __le64 TotalAllocationUnits; @@ -887,6 +932,22 @@ struct smb2_fs_full_size_info { __le32 BytesPerSector; } __packed; +#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001 +#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002 +#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004 +#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008 + +/* sector size info struct */ +struct smb3_fs_ss_info { + __le32 LogicalBytesPerSector; + __le32 PhysicalBytesPerSectorForAtomicity; + __le32 PhysicalBytesPerSectorForPerf; + __le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity; + __le32 Flags; + __le32 ByteOffsetForSectorAlignment; + __le32 ByteOffsetForPartitionAlignment; +} __packed; + /* partial list of QUERY INFO levels */ #define FILE_DIRECTORY_INFORMATION 1 #define FILE_FULL_DIRECTORY_INFORMATION 2 diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index e3fb4801ee9..0ce48db20a6 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -61,7 +61,7 @@ extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, FILE_ALL_INFO *data, - bool *adjust_tz); + bool *adjust_tz, bool *symlink); extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, __u64 size, struct cifs_sb_info *cifs_sb, bool set_alloc); @@ -123,7 +123,8 @@ extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, extern int smb2_async_readv(struct cifs_readdata *rdata); extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *buf_type); -extern int smb2_async_writev(struct cifs_writedata *wdata); +extern int smb2_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)); extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, struct kvec *iov, int n_vec); extern int SMB2_echo(struct TCP_Server_Info *server); @@ -142,12 +143,16 @@ extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf); +extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid); extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, const u64 persistent_fid, const u64 volatile_fid, const __u8 oplock_level); extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct kstatfs *FSData); +extern int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id, int lvl); extern int SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon, const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid, const __u64 length, const __u64 offset, @@ -158,5 +163,6 @@ extern int smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_lock_element *buf); extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, __u8 *lease_key, const __le32 lease_state); +extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *); #endif /* _SMB2PROTO_H */ diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 340abca3aa5..59c748ce872 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -466,7 +466,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) static inline void smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr) { - hdr->MessageId = get_next_mid(server); + hdr->MessageId = get_next_mid64(server); } static struct mid_q_entry * @@ -516,13 +516,19 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf, return -EAGAIN; } - if (ses->status != CifsGood) { - /* check if SMB2 session is bad because we are setting it up */ + if (ses->status == CifsNew) { if ((buf->Command != SMB2_SESSION_SETUP) && (buf->Command != SMB2_NEGOTIATE)) return -EAGAIN; /* else ok - we are setting up session */ } + + if (ses->status == CifsExiting) { + if (buf->Command != SMB2_LOGOFF) + return -EAGAIN; + /* else ok - we are shutting down the session */ + } + *mid = smb2_mid_entry_alloc(buf, ses->server); if (*mid == NULL) return -ENOMEM; diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index d952ee48f4d..0e538b5c962 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -90,16 +90,30 @@ #define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */ #define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ #define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ -#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */ +#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* Perform server-side data movement */ #define FSCTL_SRV_COPYCHUNK 0x001440F2 #define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2 #define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */ #define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */ +/* See FSCC 2.1.2.5 */ #define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 #define IO_REPARSE_TAG_HSM 0xC0000004 #define IO_REPARSE_TAG_SIS 0x80000007 +#define IO_REPARSE_TAG_HSM2 0x80000006 +#define IO_REPARSE_TAG_DRIVER_EXTENDER 0x80000005 +/* Used by the DFS filter. See MS-DFSC */ +#define IO_REPARSE_TAG_DFS 0x8000000A +/* Used by the DFS filter See MS-DFSC */ +#define IO_REPARSE_TAG_DFSR 0x80000012 +#define IO_REPARSE_TAG_FILTER_MANAGER 0x8000000B +/* See section MS-FSCC 2.1.2.4 */ +#define IO_REPARSE_TAG_SYMLINK 0xA000000C +#define IO_REPARSE_TAG_DEDUP 0x80000013 +#define IO_REPARSE_APPXSTREAM 0xC0000014 +/* NFS symlinks, Win 8/SMB3 and later */ +#define IO_REPARSE_TAG_NFS 0x80000014 /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 6fdcb1b4a10..18cd5650a5f 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -58,7 +58,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) return temp; else { memset(temp, 0, sizeof(struct mid_q_entry)); - temp->mid = smb_buffer->Mid; /* always LE */ + temp->mid = get_mid(smb_buffer); temp->pid = current->pid; temp->command = cpu_to_le16(smb_buffer->Command); cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); @@ -270,6 +270,26 @@ cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx, iov->iov_len = rqst->rq_pagesz; } +static unsigned long +rqst_len(struct smb_rqst *rqst) +{ + unsigned int i; + struct kvec *iov = rqst->rq_iov; + unsigned long buflen = 0; + + /* total up iov array first */ + for (i = 0; i < rqst->rq_nvec; i++) + buflen += iov[i].iov_len; + + /* add in the page array if there is one */ + if (rqst->rq_npages) { + buflen += rqst->rq_pagesz * (rqst->rq_npages - 1); + buflen += rqst->rq_tailsz; + } + + return buflen; +} + static int smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) { @@ -277,6 +297,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base); + unsigned long send_length; unsigned int i; size_t total_len = 0, sent; struct socket *ssocket = server->ssocket; @@ -285,6 +306,14 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) if (ssocket == NULL) return -ENOTSOCK; + /* sanity check send length */ + send_length = rqst_len(rqst); + if (send_length != smb_buf_length + 4) { + WARN(1, "Send length mismatch(send_length=%lu smb_buf_length=%u)\n", + send_length, smb_buf_length); + return -EIO; + } + cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length); dump_smb(iov[0].iov_base, iov[0].iov_len); @@ -410,8 +439,13 @@ static int wait_for_free_request(struct TCP_Server_Info *server, const int timeout, const int optype) { - return wait_for_free_credits(server, timeout, - server->ops->get_credits_field(server, optype)); + int *val; + + val = server->ops->get_credits_field(server, optype); + /* Since an echo is already inflight, no need to wait to send another */ + if (*val <= 0 && optype == CIFS_ECHO_OP) + return -EAGAIN; + return wait_for_free_credits(server, timeout, val); } static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, @@ -426,13 +460,20 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, return -EAGAIN; } - if (ses->status != CifsGood) { - /* check if SMB session is bad because we are setting it up */ + if (ses->status == CifsNew) { if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && (in_buf->Command != SMB_COM_NEGOTIATE)) return -EAGAIN; /* else ok - we are setting up session */ } + + if (ses->status == CifsExiting) { + /* check if SMB session is bad because we are setting it up */ + if (in_buf->Command != SMB_COM_LOGOFF_ANDX) + return -EAGAIN; + /* else ok - we are shutting down session */ + } + *ppmidQ = AllocMidQEntry(in_buf, ses->server); if (*ppmidQ == NULL) return -ENOMEM; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 09afda4cc58..5ac836a86b1 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -82,9 +82,11 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) goto remove_ea_exit; ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, - (__u16)0, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, NULL, (__u16)0, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } remove_ea_exit: kfree(full_path); @@ -149,18 +151,22 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, cifs_dbg(FYI, "attempt to set cifs inode metadata\n"); ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, - (__u16)value_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, ea_value, (__u16)value_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, - (__u16)value_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, ea_value, (__u16)value_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, strlen(CIFS_XATTR_CIFS_ACL)) == 0) { #ifdef CONFIG_CIFS_ACL @@ -170,8 +176,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, rc = -ENOMEM; } else { memcpy(pacl, ea_value, value_size); - rc = set_cifs_acl(pacl, value_size, - direntry->d_inode, full_path, CIFS_ACL_DACL); + if (pTcon->ses->server->ops->set_acl) + rc = pTcon->ses->server->ops->set_acl(pacl, + value_size, direntry->d_inode, + full_path, CIFS_ACL_DACL); + else + rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ CIFS_I(direntry->d_inode)->time = 0; kfree(pacl); @@ -272,17 +282,21 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, /* revalidate/getattr then populate from inode */ } /* BB add else when above is implemented */ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, ea_name, ea_value, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, ea_name, ea_value, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { #ifdef CONFIG_CIFS_POSIX @@ -313,8 +327,11 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, u32 acllen; struct cifs_ntsd *pacl; - pacl = get_cifs_acl(cifs_sb, direntry->d_inode, - full_path, &acllen); + if (pTcon->ses->server->ops->get_acl == NULL) + goto get_ea_exit; /* rc already EOPNOTSUPP */ + + pacl = pTcon->ses->server->ops->get_acl(cifs_sb, + direntry->d_inode, full_path, &acllen); if (IS_ERR(pacl)) { rc = PTR_ERR(pacl); cifs_dbg(VFS, "%s: error %zd getting sec desc\n", @@ -400,11 +417,12 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) /* if proc/fs/cifs/streamstoxattr is set then search server for EAs or streams to returns as xattrs */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, NULL, data, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); list_ea_exit: kfree(full_path); free_xid(xid); diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 911cf30d057..7740b1c871c 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -101,7 +101,7 @@ struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb) inode = coda_iget(sb, fid, &attr); if (IS_ERR(inode)) - printk("coda_cnode_make: coda_iget failed\n"); + pr_warn("%s: coda_iget failed\n", __func__); return inode; } @@ -137,7 +137,7 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) unsigned long hash = coda_f2i(fid); if ( !sb ) { - printk("coda_fid_to_inode: no sb!\n"); + pr_warn("%s: no sb!\n", __func__); return NULL; } diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h index b7143cf783a..381c993b142 100644 --- a/fs/coda/coda_int.h +++ b/fs/coda/coda_int.h @@ -10,7 +10,7 @@ extern int coda_hard; extern int coda_fake_statfs; void coda_destroy_inodecache(void); -int coda_init_inodecache(void); +int __init coda_init_inodecache(void); int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync); void coda_sysctl_init(void); void coda_sysctl_clean(void); diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index cc0ea9fe5ec..d42b725b1d2 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -12,6 +12,12 @@ #ifndef _LINUX_CODA_FS #define _LINUX_CODA_FS +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/param.h> #include <linux/mm.h> @@ -40,7 +46,7 @@ extern const struct file_operations coda_ioctl_operations; int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); int coda_permission(struct inode *inode, int mask); -int coda_revalidate_inode(struct dentry *); +int coda_revalidate_inode(struct inode *); int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); int coda_setattr(struct dentry *, struct iattr *); @@ -63,7 +69,7 @@ void coda_sysctl_clean(void); else \ ptr = (cast)vzalloc((unsigned long) size); \ if (!ptr) \ - printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ + pr_warn("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ } while (0) diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 190effc6a6f..cd8a63238b1 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -102,7 +102,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig int type = 0; if (length > CODA_MAXNAMLEN) { - printk(KERN_ERR "name too long: lookup, %s (%*s)\n", + pr_err("name too long: lookup, %s (%*s)\n", coda_i2s(dir), (int)length, name); return ERR_PTR(-ENAMETOOLONG); } @@ -387,9 +387,6 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (!host_file->f_op) - return -ENOTDIR; - if (host_file->f_op->iterate) { struct inode *host_inode = file_inode(host_file); mutex_lock(&host_inode->i_mutex); @@ -456,23 +453,23 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir, sizeof(*vdir)); if (ret < 0) { - printk(KERN_ERR "coda readdir: read dir %s failed %d\n", - coda_f2s(&cii->c_fid), ret); + pr_err("%s: read dir %s failed %d\n", + __func__, coda_f2s(&cii->c_fid), ret); break; } if (ret == 0) break; /* end of directory file reached */ /* catch truncated reads */ if (ret < vdir_size || ret < vdir_size + vdir->d_namlen) { - printk(KERN_ERR "coda readdir: short read on %s\n", - coda_f2s(&cii->c_fid)); + pr_err("%s: short read on %s\n", + __func__, coda_f2s(&cii->c_fid)); ret = -EBADF; break; } /* validate whether the directory file actually makes sense */ if (vdir->d_reclen < vdir_size + vdir->d_namlen) { - printk(KERN_ERR "coda readdir: invalid dir %s\n", - coda_f2s(&cii->c_fid)); + pr_err("%s: invalid dir %s\n", + __func__, coda_f2s(&cii->c_fid)); ret = -EBADF; break; } @@ -566,13 +563,12 @@ static int coda_dentry_delete(const struct dentry * dentry) * cache manager Venus issues a downcall to the kernel when this * happens */ -int coda_revalidate_inode(struct dentry *dentry) +int coda_revalidate_inode(struct inode *inode) { struct coda_vattr attr; int error; int old_mode; ino_t old_ino; - struct inode *inode = dentry->d_inode; struct coda_inode_info *cii = ITOC(inode); if (!cii->c_flags) @@ -593,8 +589,8 @@ int coda_revalidate_inode(struct dentry *dentry) coda_vattr_to_iattr(inode, &attr); if ((old_mode & S_IFMT) != (inode->i_mode & S_IFMT)) { - printk("Coda: inode %ld, fid %s changed type!\n", - inode->i_ino, coda_f2s(&(cii->c_fid))); + pr_warn("inode %ld, fid %s changed type!\n", + inode->i_ino, coda_f2s(&(cii->c_fid))); } /* the following can happen when a local fid is replaced diff --git a/fs/coda/file.c b/fs/coda/file.c index 380b798f844..9e83b779021 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -36,7 +36,7 @@ coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *p BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (!host_file->f_op || !host_file->f_op->read) + if (!host_file->f_op->read) return -EINVAL; return host_file->f_op->read(host_file, buf, count, ppos); @@ -75,7 +75,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (!host_file->f_op || !host_file->f_op->write) + if (!host_file->f_op->write) return -EINVAL; host_inode = file_inode(host_file); @@ -105,7 +105,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (!host_file->f_op || !host_file->f_op->mmap) + if (!host_file->f_op->mmap) return -ENODEV; coda_inode = file_inode(coda_file); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 4dcc0d81a7a..fe3afb2de88 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -73,7 +73,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -int coda_init_inodecache(void) +int __init coda_init_inodecache(void) { coda_inode_cachep = kmem_cache_create("coda_inode_cache", sizeof(struct coda_inode_info), @@ -96,6 +96,7 @@ void coda_destroy_inodecache(void) static int coda_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NOATIME; return 0; } @@ -118,12 +119,12 @@ static int get_device_index(struct coda_mount_data *data) int idx; if (data == NULL) { - printk("coda_read_super: Bad mount data\n"); + pr_warn("%s: Bad mount data\n", __func__); return -1; } if (data->version != CODA_MOUNT_VERSION) { - printk("coda_read_super: Bad mount version\n"); + pr_warn("%s: Bad mount version\n", __func__); return -1; } @@ -140,13 +141,13 @@ static int get_device_index(struct coda_mount_data *data) fdput(f); if (idx < 0 || idx >= MAX_CODADEVS) { - printk("coda_read_super: Bad minor number\n"); + pr_warn("%s: Bad minor number\n", __func__); return -1; } return idx; Ebadf: - printk("coda_read_super: Bad file\n"); + pr_warn("%s: Bad file\n", __func__); return -1; } @@ -167,19 +168,19 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) if(idx == -1) idx = 0; - printk(KERN_INFO "coda_read_super: device index: %i\n", idx); + pr_info("%s: device index: %i\n", __func__, idx); vc = &coda_comms[idx]; mutex_lock(&vc->vc_mutex); if (!vc->vc_inuse) { - printk("coda_read_super: No pseudo device\n"); + pr_warn("%s: No pseudo device\n", __func__); error = -EINVAL; goto unlock_out; } if (vc->vc_sb) { - printk("coda_read_super: Device already mounted\n"); + pr_warn("%s: Device already mounted\n", __func__); error = -EBUSY; goto unlock_out; } @@ -203,22 +204,23 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) /* get root fid from Venus: this needs the root inode */ error = venus_rootfid(sb, &fid); if ( error ) { - printk("coda_read_super: coda_get_rootfid failed with %d\n", - error); + pr_warn("%s: coda_get_rootfid failed with %d\n", + __func__, error); goto error; } - printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid)); + pr_info("%s: rootfid is %s\n", __func__, coda_f2s(&fid)); /* make root inode */ root = coda_cnode_make(&fid, sb); if (IS_ERR(root)) { error = PTR_ERR(root); - printk("Failure of coda_cnode_make for root: error %d\n", error); + pr_warn("Failure of coda_cnode_make for root: error %d\n", + error); goto error; } - printk("coda_read_super: rootinode is %ld dev %s\n", - root->i_ino, root->i_sb->s_id); + pr_info("%s: rootinode is %ld dev %s\n", + __func__, root->i_ino, root->i_sb->s_id); sb->s_root = d_make_root(root); if (!sb->s_root) { error = -EINVAL; @@ -245,19 +247,19 @@ static void coda_put_super(struct super_block *sb) sb->s_fs_info = NULL; mutex_unlock(&vcp->vc_mutex); - printk("Coda: Bye bye.\n"); + pr_info("Bye bye.\n"); } static void coda_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); coda_cache_clear_inode(inode); } int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - int err = coda_revalidate_inode(dentry); + int err = coda_revalidate_inode(dentry->d_inode); if (!err) generic_fillattr(dentry->d_inode, stat); return err; diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index ebc2bae6c28..5c1e4242368 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -114,14 +114,14 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, int size = sizeof(*dcbuf); if ( nbytes < sizeof(struct coda_out_hdr) ) { - printk("coda_downcall opc %d uniq %d, not enough!\n", - hdr.opcode, hdr.unique); + pr_warn("coda_downcall opc %d uniq %d, not enough!\n", + hdr.opcode, hdr.unique); count = nbytes; goto out; } if ( nbytes > size ) { - printk("Coda: downcall opc %d, uniq %d, too much!", - hdr.opcode, hdr.unique); + pr_warn("downcall opc %d, uniq %d, too much!", + hdr.opcode, hdr.unique); nbytes = size; } CODA_ALLOC(dcbuf, union outputArgs *, nbytes); @@ -136,7 +136,8 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, CODA_FREE(dcbuf, nbytes); if (error) { - printk("psdev_write: coda_downcall error: %d\n", error); + pr_warn("%s: coda_downcall error: %d\n", + __func__, error); retval = error; goto out; } @@ -157,16 +158,17 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, mutex_unlock(&vcp->vc_mutex); if (!req) { - printk("psdev_write: msg (%d, %d) not found\n", - hdr.opcode, hdr.unique); + pr_warn("%s: msg (%d, %d) not found\n", + __func__, hdr.opcode, hdr.unique); retval = -ESRCH; goto out; } /* move data into response buffer. */ if (req->uc_outSize < nbytes) { - printk("psdev_write: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n", - req->uc_outSize, (long)nbytes, hdr.opcode, hdr.unique); + pr_warn("%s: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n", + __func__, req->uc_outSize, (long)nbytes, + hdr.opcode, hdr.unique); nbytes = req->uc_outSize; /* don't have more space! */ } if (copy_from_user(req->uc_data, buf, nbytes)) { @@ -240,8 +242,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, /* Move the input args into userspace */ count = req->uc_inSize; if (nbytes < req->uc_inSize) { - printk ("psdev_read: Venus read %ld bytes of %d in message\n", - (long)nbytes, req->uc_inSize); + pr_warn("%s: Venus read %ld bytes of %d in message\n", + __func__, (long)nbytes, req->uc_inSize); count = nbytes; } @@ -305,7 +307,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file) struct upc_req *req, *tmp; if (!vcp || !vcp->vc_inuse ) { - printk("psdev_release: Not open.\n"); + pr_warn("%s: Not open.\n", __func__); return -1; } @@ -354,8 +356,8 @@ static int init_coda_psdev(void) { int i, err = 0; if (register_chrdev(CODA_PSDEV_MAJOR, "coda", &coda_psdev_fops)) { - printk(KERN_ERR "coda_psdev: unable to get major %d\n", - CODA_PSDEV_MAJOR); + pr_err("%s: unable to get major %d\n", + __func__, CODA_PSDEV_MAJOR); return -EIO; } coda_psdev_class = class_create(THIS_MODULE, "coda"); @@ -393,13 +395,13 @@ static int __init init_coda(void) goto out2; status = init_coda_psdev(); if ( status ) { - printk("Problem (%d) in init_coda_psdev\n", status); + pr_warn("Problem (%d) in init_coda_psdev\n", status); goto out1; } status = register_filesystem(&coda_fs_type); if (status) { - printk("coda: failed to register filesystem!\n"); + pr_warn("failed to register filesystem!\n"); goto out; } return 0; @@ -420,9 +422,8 @@ static void __exit exit_coda(void) int err, i; err = unregister_filesystem(&coda_fs_type); - if ( err != 0 ) { - printk("coda: failed to unregister filesystem\n"); - } + if (err != 0) + pr_warn("failed to unregister filesystem\n"); for (i = 0; i < MAX_CODADEVS; i++) device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); class_destroy(coda_psdev_class); diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index af56ad56a89..34218a8a28c 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -14,7 +14,7 @@ #ifdef CONFIG_SYSCTL static struct ctl_table_header *fs_table_header; -static ctl_table coda_table[] = { +static struct ctl_table coda_table[] = { { .procname = "timeout", .data = &coda_timeout, @@ -39,7 +39,7 @@ static ctl_table coda_table[] = { {} }; -static ctl_table fs_table[] = { +static struct ctl_table fs_table[] = { { .procname = "coda", .mode = 0555, diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 3a731976dc5..21fcf8dcb9c 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -508,8 +508,8 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid, inp->coda_ioctl.data = (char *)(INSIZE(ioctl)); /* get the data out of user space */ - if ( copy_from_user((char*)inp + (long)inp->coda_ioctl.data, - data->vi.in, data->vi.in_size) ) { + if (copy_from_user((char *)inp + (long)inp->coda_ioctl.data, + data->vi.in, data->vi.in_size)) { error = -EINVAL; goto exit; } @@ -518,8 +518,8 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid, &outsize, inp); if (error) { - printk("coda_pioctl: Venus returns: %d for %s\n", - error, coda_f2s(fid)); + pr_warn("%s: Venus returns: %d for %s\n", + __func__, error, coda_f2s(fid)); goto exit; } @@ -675,7 +675,7 @@ static int coda_upcall(struct venus_comm *vcp, mutex_lock(&vcp->vc_mutex); if (!vcp->vc_inuse) { - printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n"); + pr_notice("Venus dead, not sending upcall\n"); error = -ENXIO; goto exit; } @@ -725,7 +725,7 @@ static int coda_upcall(struct venus_comm *vcp, error = -EINTR; if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) { - printk(KERN_WARNING "coda: Unexpected interruption.\n"); + pr_warn("Unexpected interruption.\n"); goto exit; } @@ -735,7 +735,7 @@ static int coda_upcall(struct venus_comm *vcp, /* Venus saw the upcall, make sure we can send interrupt signal */ if (!vcp->vc_inuse) { - printk(KERN_INFO "coda: Venus dead, not sending signal.\n"); + pr_info("Venus dead, not sending signal.\n"); goto exit; } diff --git a/fs/compat.c b/fs/compat.c index 6af20de2c1a..66d3d3c6b4b 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -72,8 +72,8 @@ int compat_printk(const char *fmt, ...) * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. */ -asmlinkage long compat_sys_utime(const char __user *filename, - struct compat_utimbuf __user *t) +COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, + struct compat_utimbuf __user *, t) { struct timespec tv[2]; @@ -87,13 +87,13 @@ asmlinkage long compat_sys_utime(const char __user *filename, return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); } -asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags) +COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) { struct timespec tv[2]; if (t) { - if (get_compat_timespec(&tv[0], &t[0]) || - get_compat_timespec(&tv[1], &t[1])) + if (compat_get_timespec(&tv[0], &t[0]) || + compat_get_timespec(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) @@ -102,7 +102,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filena return do_utimes(dfd, filename, t ? tv : NULL, flags); } -asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t) +COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) { struct timespec tv[2]; @@ -121,7 +121,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filena return do_utimes(dfd, filename, t ? tv : NULL, 0); } -asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t) +COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) { return compat_sys_futimesat(AT_FDCWD, filename, t); } @@ -159,8 +159,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; } -asmlinkage long compat_sys_newstat(const char __user * filename, - struct compat_stat __user *statbuf) +COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, + struct compat_stat __user *, statbuf) { struct kstat stat; int error; @@ -171,8 +171,8 @@ asmlinkage long compat_sys_newstat(const char __user * filename, return cp_compat_stat(&stat, statbuf); } -asmlinkage long compat_sys_newlstat(const char __user * filename, - struct compat_stat __user *statbuf) +COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, + struct compat_stat __user *, statbuf) { struct kstat stat; int error; @@ -184,9 +184,9 @@ asmlinkage long compat_sys_newlstat(const char __user * filename, } #ifndef __ARCH_WANT_STAT64 -asmlinkage long compat_sys_newfstatat(unsigned int dfd, - const char __user *filename, - struct compat_stat __user *statbuf, int flag) +COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, + const char __user *, filename, + struct compat_stat __user *, statbuf, int, flag) { struct kstat stat; int error; @@ -198,8 +198,8 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, } #endif -asmlinkage long compat_sys_newfstat(unsigned int fd, - struct compat_stat __user * statbuf) +COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, + struct compat_stat __user *, statbuf) { struct kstat stat; int error = vfs_fstat(fd, &stat); @@ -247,7 +247,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * * The following statfs calls are copies of code from fs/statfs.c and * should be checked against those from time to time */ -asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) +COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf) { struct kstatfs tmp; int error = user_statfs(pathname, &tmp); @@ -256,7 +256,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta return error; } -asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) +COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf) { struct kstatfs tmp; int error = fd_statfs(fd, &tmp); @@ -298,7 +298,7 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat return 0; } -asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) +COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf) { struct kstatfs tmp; int error; @@ -312,7 +312,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s return error; } -asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) +COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf) { struct kstatfs tmp; int error; @@ -331,7 +331,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c * Given how simple this syscall is that apporach is more maintainable * than the various conversion hacks. */ -asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u) +COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u) { struct compat_ustat tmp; struct kstatfs sbuf; @@ -399,12 +399,28 @@ static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *u } #endif -asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, - unsigned long arg) +static unsigned int +convert_fcntl_cmd(unsigned int cmd) +{ + switch (cmd) { + case F_GETLK64: + return F_GETLK; + case F_SETLK64: + return F_SETLK; + case F_SETLKW64: + return F_SETLKW; + } + + return cmd; +} + +COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) { mm_segment_t old_fs; struct flock f; long ret; + unsigned int conv_cmd; switch (cmd) { case F_GETLK: @@ -441,16 +457,18 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, case F_GETLK64: case F_SETLK64: case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: ret = get_compat_flock64(&f, compat_ptr(arg)); if (ret != 0) break; old_fs = get_fs(); set_fs(KERNEL_DS); - ret = sys_fcntl(fd, (cmd == F_GETLK64) ? F_GETLK : - ((cmd == F_SETLK64) ? F_SETLK : F_SETLKW), - (unsigned long)&f); + conv_cmd = convert_fcntl_cmd(cmd); + ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); set_fs(old_fs); - if (cmd == F_GETLK64 && ret == 0) { + if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { /* need to return lock information - see above for commentary */ if (f.l_start > COMPAT_LOFF_T_MAX) ret = -EOVERFLOW; @@ -468,16 +486,22 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, return ret; } -asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, - unsigned long arg) +COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) { - if ((cmd == F_GETLK64) || (cmd == F_SETLK64) || (cmd == F_SETLKW64)) + switch (cmd) { + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: return -EINVAL; + } return compat_sys_fcntl64(fd, cmd, arg); } -asmlinkage long -compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p) +COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p) { long ret; aio_context_t ctx64; @@ -496,32 +520,24 @@ compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p) return ret; } -asmlinkage long -compat_sys_io_getevents(aio_context_t ctx_id, - unsigned long min_nr, - unsigned long nr, - struct io_event __user *events, - struct compat_timespec __user *timeout) +COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout) { - long ret; struct timespec t; struct timespec __user *ut = NULL; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_WRITE, events, - nr * sizeof(struct io_event)))) - goto out; if (timeout) { - if (get_compat_timespec(&t, timeout)) - goto out; + if (compat_get_timespec(&t, timeout)) + return -EFAULT; ut = compat_alloc_user_space(sizeof(*ut)); if (copy_to_user(ut, &t, sizeof(t)) ) - goto out; + return -EFAULT; } - ret = sys_io_getevents(ctx_id, min_nr, nr, events, ut); -out: - return ret; + return sys_io_getevents(ctx_id, min_nr, nr, events, ut); } /* A write operation does a read from user space and vice versa */ @@ -617,8 +633,8 @@ copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) #define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) -asmlinkage long -compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb) +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, u32 __user *, iocb) { struct iocb __user * __user *iocb64; long ret; @@ -770,10 +786,10 @@ static int do_nfs4_super_data_conv(void *raw_data) #define NCPFS_NAME "ncpfs" #define NFS4_NAME "nfs4" -asmlinkage long compat_sys_mount(const char __user * dev_name, - const char __user * dir_name, - const char __user * type, unsigned long flags, - const void __user * data) +COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, + const char __user *, dir_name, + const char __user *, type, compat_ulong_t, flags, + const void __user *, data) { char *kernel_type; unsigned long data_page; @@ -869,8 +885,8 @@ efault: return -EFAULT; } -asmlinkage long compat_sys_old_readdir(unsigned int fd, - struct compat_old_linux_dirent __user *dirent, unsigned int count) +COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct compat_old_linux_dirent __user *, dirent, unsigned int, count) { int error; struct fd f = fdget(fd); @@ -948,8 +964,8 @@ efault: return -EFAULT; } -asmlinkage long compat_sys_getdents(unsigned int fd, - struct compat_linux_dirent __user *dirent, unsigned int count) +COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, + struct compat_linux_dirent __user *, dirent, unsigned int, count) { struct fd f; struct compat_linux_dirent __user * lastdirent; @@ -981,7 +997,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd, return error; } -#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 +#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64 struct compat_getdents_callback64 { struct dir_context ctx; @@ -1033,8 +1049,8 @@ efault: return -EFAULT; } -asmlinkage long compat_sys_getdents64(unsigned int fd, - struct linux_dirent64 __user * dirent, unsigned int count) +COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd, + struct linux_dirent64 __user *, dirent, unsigned int, count) { struct fd f; struct linux_dirent64 __user * lastdirent; @@ -1066,7 +1082,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, fdput(f); return error; } -#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ +#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */ /* * Exactly like fs/open.c:sys_open(), except that it doesn't set the @@ -1287,9 +1303,9 @@ out_nofds: return ret; } -asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timeval __user *tvp) +COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timeval __user *, tvp) { struct timespec end_time, *to = NULL; struct compat_timeval tv; @@ -1320,7 +1336,7 @@ struct compat_sel_arg_struct { compat_uptr_t tvp; }; -asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg) +COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; @@ -1381,9 +1397,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, return ret; } -asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timespec __user *tsp, void __user *sig) +COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timespec __user *, tsp, void __user *, sig) { compat_size_t sigsetsize = 0; compat_uptr_t up = 0; @@ -1400,9 +1416,9 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp, sigsetsize); } -asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, - unsigned int nfds, struct compat_timespec __user *tsp, - const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) +COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, + unsigned int, nfds, struct compat_timespec __user *, tsp, + const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { compat_sigset_t ss32; sigset_t ksigmask, sigsaved; diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index a81147e2e4e..4d24d17bcfc 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -88,6 +88,11 @@ static void cputime_to_compat_timeval(const cputime_t cputime, #define ELF_HWCAP COMPAT_ELF_HWCAP #endif +#ifdef COMPAT_ELF_HWCAP2 +#undef ELF_HWCAP2 +#define ELF_HWCAP2 COMPAT_ELF_HWCAP2 +#endif + #ifdef COMPAT_ARCH_DLINFO #undef ARCH_DLINFO #define ARCH_DLINFO COMPAT_ARCH_DLINFO diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 5d19acfa7c6..e8228904727 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, struct i2c_msg __user *tmsgs; struct i2c_msg32 __user *umsgs; compat_caddr_t datap; - int nmsgs, i; + u32 nmsgs; + int i; if (get_user(nmsgs, &udata->nmsgs)) return -EFAULT; @@ -1537,9 +1538,10 @@ static int compat_ioctl_check_table(unsigned int xcmd) return ioctl_pointer[i] == xcmd; } -asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, - unsigned long arg) +COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg32) { + unsigned long arg = arg32; struct fd f = fdget(fd); int error = -EBADF; if (!f.file) @@ -1583,13 +1585,13 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, /*FALL THROUGH*/ default: - if (f.file->f_op && f.file->f_op->compat_ioctl) { + if (f.file->f_op->compat_ioctl) { error = f.file->f_op->compat_ioctl(f.file, cmd, arg); if (error != -ENOIOCTLCMD) goto out_fput; } - if (!f.file->f_op || !f.file->f_op->unlocked_ioctl) + if (!f.file->f_op->unlocked_ioctl) goto do_ioctl; break; } diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index b5f0a3b91f1..bd4a3c16709 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -24,6 +24,12 @@ * configfs Copyright (C) 2005 Oracle. All rights reserved. */ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 277bd1be21f..668dcabc569 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -56,29 +56,28 @@ static void configfs_d_iput(struct dentry * dentry, struct configfs_dirent *sd = dentry->d_fsdata; if (sd) { - BUG_ON(sd->s_dentry != dentry); /* Coordinate with configfs_readdir */ spin_lock(&configfs_dirent_lock); - sd->s_dentry = NULL; + /* Coordinate with configfs_attach_attr where will increase + * sd->s_count and update sd->s_dentry to new allocated one. + * Only set sd->dentry to null when this dentry is the only + * sd owner. + * If not do so, configfs_d_iput may run just after + * configfs_attach_attr and set sd->s_dentry to null + * even it's still in use. + */ + if (atomic_read(&sd->s_count) <= 2) + sd->s_dentry = NULL; + spin_unlock(&configfs_dirent_lock); configfs_put(sd); } iput(inode); } -/* - * We _must_ delete our dentries on last dput, as the chain-to-parent - * behavior is required to clear the parents of default_groups. - */ -static int configfs_d_delete(const struct dentry *dentry) -{ - return 1; -} - const struct dentry_operations configfs_dentry_ops = { .d_iput = configfs_d_iput, - /* simple_delete_dentry() isn't exported */ - .d_delete = configfs_d_delete, + .d_delete = always_delete_dentry, }; #ifdef CONFIG_LOCKDEP @@ -426,8 +425,11 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den struct configfs_attribute * attr = sd->s_element; int error; + spin_lock(&configfs_dirent_lock); dentry->d_fsdata = configfs_get(sd); sd->s_dentry = dentry; + spin_unlock(&configfs_dirent_lock); + error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, configfs_init_file); if (error) { @@ -938,9 +940,9 @@ static void client_drop_item(struct config_item *parent_item, #ifdef DEBUG static void configfs_dump_one(struct configfs_dirent *sd, int level) { - printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd)); + pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd)); -#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type); +#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type); type_print(CONFIGFS_ROOT); type_print(CONFIGFS_DIR); type_print(CONFIGFS_ITEM_ATTR); @@ -1697,7 +1699,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) struct dentry *root = dentry->d_sb->s_root; if (dentry->d_parent != root) { - printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); + pr_err("Tried to unregister non-subsystem!\n"); return; } @@ -1707,7 +1709,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { - printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); + pr_err("Tried to unregister non-empty subsystem!\n"); } spin_unlock(&configfs_dirent_lock); mutex_unlock(&configfs_symlink_mutex); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index a9d35b0e06c..5946ad98053 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -168,9 +168,8 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, * In practice the maximum level of locking depth is * already reached. Just inform about possible reasons. */ - printk(KERN_INFO "configfs: Too many levels of inodes" - " for the locking correctness validator.\n"); - printk(KERN_INFO "Spurious warnings may appear.\n"); + pr_info("Too many levels of inodes for the locking correctness validator.\n"); + pr_info("Spurious warnings may appear.\n"); } } } diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 50cee7f9110..e65f9ffbb99 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -19,7 +19,7 @@ * Boston, MA 021110-1307, USA. * * Based on kobject: - * kobject is Copyright (c) 2002-2003 Patrick Mochel + * kobject is Copyright (c) 2002-2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. * @@ -35,9 +35,9 @@ #include <linux/configfs.h> -static inline struct config_item * to_item(struct list_head * entry) +static inline struct config_item *to_item(struct list_head *entry) { - return container_of(entry,struct config_item,ci_entry); + return container_of(entry, struct config_item, ci_entry); } /* Evil kernel */ @@ -47,34 +47,35 @@ static void config_item_release(struct kref *kref); * config_item_init - initialize item. * @item: item in question. */ -void config_item_init(struct config_item * item) +void config_item_init(struct config_item *item) { kref_init(&item->ci_kref); INIT_LIST_HEAD(&item->ci_entry); } +EXPORT_SYMBOL(config_item_init); /** * config_item_set_name - Set the name of an item * @item: item. - * @name: name. + * @fmt: The vsnprintf()'s format string. * * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a * dynamically allocated string that @item->ci_name points to. * Otherwise, use the static @item->ci_namebuf array. */ -int config_item_set_name(struct config_item * item, const char * fmt, ...) +int config_item_set_name(struct config_item *item, const char *fmt, ...) { int error = 0; int limit = CONFIGFS_ITEM_NAME_LEN; int need; va_list args; - char * name; + char *name; /* * First, try the static array */ - va_start(args,fmt); - need = vsnprintf(item->ci_namebuf,limit,fmt,args); + va_start(args, fmt); + need = vsnprintf(item->ci_namebuf, limit, fmt, args); va_end(args); if (need < limit) name = item->ci_namebuf; @@ -83,13 +84,13 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...) * Need more space? Allocate it and try again */ limit = need + 1; - name = kmalloc(limit,GFP_KERNEL); + name = kmalloc(limit, GFP_KERNEL); if (!name) { error = -ENOMEM; goto Done; } - va_start(args,fmt); - need = vsnprintf(name,limit,fmt,args); + va_start(args, fmt); + need = vsnprintf(name, limit, fmt, args); va_end(args); /* Still? Give up. */ @@ -109,7 +110,6 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...) Done: return error; } - EXPORT_SYMBOL(config_item_set_name); void config_item_init_type_name(struct config_item *item, @@ -131,20 +131,21 @@ void config_group_init_type_name(struct config_group *group, const char *name, } EXPORT_SYMBOL(config_group_init_type_name); -struct config_item * config_item_get(struct config_item * item) +struct config_item *config_item_get(struct config_item *item) { if (item) kref_get(&item->ci_kref); return item; } +EXPORT_SYMBOL(config_item_get); -static void config_item_cleanup(struct config_item * item) +static void config_item_cleanup(struct config_item *item) { - struct config_item_type * t = item->ci_type; - struct config_group * s = item->ci_group; - struct config_item * parent = item->ci_parent; + struct config_item_type *t = item->ci_type; + struct config_group *s = item->ci_group; + struct config_item *parent = item->ci_parent; - pr_debug("config_item %s: cleaning up\n",config_item_name(item)); + pr_debug("config_item %s: cleaning up\n", config_item_name(item)); if (item->ci_name != item->ci_namebuf) kfree(item->ci_name); item->ci_name = NULL; @@ -167,21 +168,23 @@ static void config_item_release(struct kref *kref) * * Decrement the refcount, and if 0, call config_item_cleanup(). */ -void config_item_put(struct config_item * item) +void config_item_put(struct config_item *item) { if (item) kref_put(&item->ci_kref, config_item_release); } +EXPORT_SYMBOL(config_item_put); /** * config_group_init - initialize a group for use - * @k: group + * @group: config_group */ void config_group_init(struct config_group *group) { config_item_init(&group->cg_item); INIT_LIST_HEAD(&group->cg_children); } +EXPORT_SYMBOL(config_group_init); /** * config_group_find_item - search for item in group. @@ -195,11 +198,11 @@ void config_group_init(struct config_group *group) struct config_item *config_group_find_item(struct config_group *group, const char *name) { - struct list_head * entry; - struct config_item * ret = NULL; + struct list_head *entry; + struct config_item *ret = NULL; - list_for_each(entry,&group->cg_children) { - struct config_item * item = to_item(entry); + list_for_each(entry, &group->cg_children) { + struct config_item *item = to_item(entry); if (config_item_name(item) && !strcmp(config_item_name(item), name)) { ret = config_item_get(item); @@ -208,9 +211,4 @@ struct config_item *config_group_find_item(struct config_group *group, } return ret; } - -EXPORT_SYMBOL(config_item_init); -EXPORT_SYMBOL(config_group_init); -EXPORT_SYMBOL(config_item_get); -EXPORT_SYMBOL(config_item_put); EXPORT_SYMBOL(config_group_find_item); diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 7f26c3cf75a..f6c28583339 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -85,7 +85,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); } else { - pr_debug("configfs: could not get root inode\n"); + pr_debug("could not get root inode\n"); return -ENOMEM; } @@ -155,7 +155,7 @@ static int __init configfs_init(void) return 0; out4: - printk(KERN_ERR "configfs: Unable to register filesystem!\n"); + pr_err("Unable to register filesystem!\n"); configfs_inode_exit(); out3: kobject_put(config_kobj); diff --git a/fs/coredump.c b/fs/coredump.c index 9bdeca12ae0..a93f7e6ea4c 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -40,7 +40,6 @@ #include <trace/events/task.h> #include "internal.h" -#include "coredump.h" #include <trace/events/sched.h> @@ -74,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size) static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) { int free, need; + va_list arg_copy; again: free = cn->size - cn->used; - need = vsnprintf(cn->corename + cn->used, free, fmt, arg); + + va_copy(arg_copy, arg); + need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy); + va_end(arg_copy); + if (need < free) { cn->used += need; return 0; @@ -302,7 +306,7 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (unlikely(nr < 0)) return nr; - tsk->flags = PF_DUMPCORE; + tsk->flags |= PF_DUMPCORE; if (atomic_read(&mm->mm_users) == nr + 1) goto done; /* @@ -485,7 +489,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) return err; } -void do_coredump(siginfo_t *siginfo) +void do_coredump(const siginfo_t *siginfo) { struct core_state core_state; struct core_name cn; @@ -645,7 +649,7 @@ void do_coredump(siginfo_t *siginfo) */ if (!uid_eq(inode->i_uid, current_fsuid())) goto close_fail; - if (!cprm.file->f_op || !cprm.file->f_op->write) + if (!cprm.file->f_op->write) goto close_fail; if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) goto close_fail; @@ -685,40 +689,55 @@ fail: * do on a core-file: use only these functions to write out all the * necessary info. */ -int dump_write(struct file *file, const void *addr, int nr) +int dump_emit(struct coredump_params *cprm, const void *addr, int nr) { - return !dump_interrupted() && - access_ok(VERIFY_READ, addr, nr) && - file->f_op->write(file, addr, nr, &file->f_pos) == nr; + struct file *file = cprm->file; + loff_t pos = file->f_pos; + ssize_t n; + if (cprm->written + nr > cprm->limit) + return 0; + while (nr) { + if (dump_interrupted()) + return 0; + n = __kernel_write(file, addr, nr, &pos); + if (n <= 0) + return 0; + file->f_pos = pos; + cprm->written += n; + nr -= n; + } + return 1; } -EXPORT_SYMBOL(dump_write); +EXPORT_SYMBOL(dump_emit); -int dump_seek(struct file *file, loff_t off) +int dump_skip(struct coredump_params *cprm, size_t nr) { - int ret = 1; - + static char zeroes[PAGE_SIZE]; + struct file *file = cprm->file; if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (cprm->written + nr > cprm->limit) + return 0; if (dump_interrupted() || - file->f_op->llseek(file, off, SEEK_CUR) < 0) + file->f_op->llseek(file, nr, SEEK_CUR) < 0) return 0; + cprm->written += nr; + return 1; } else { - char *buf = (char *)get_zeroed_page(GFP_KERNEL); - - if (!buf) - return 0; - while (off > 0) { - unsigned long n = off; - - if (n > PAGE_SIZE) - n = PAGE_SIZE; - if (!dump_write(file, buf, n)) { - ret = 0; - break; - } - off -= n; + while (nr > PAGE_SIZE) { + if (!dump_emit(cprm, zeroes, PAGE_SIZE)) + return 0; + nr -= PAGE_SIZE; } - free_page((unsigned long)buf); + return dump_emit(cprm, zeroes, nr); } - return ret; } -EXPORT_SYMBOL(dump_seek); +EXPORT_SYMBOL(dump_skip); + +int dump_align(struct coredump_params *cprm, int align) +{ + unsigned mod = cprm->written & (align - 1); + if (align & (align - 1)) + return 0; + return mod ? dump_skip(cprm, align - mod) : 1; +} +EXPORT_SYMBOL(dump_align); diff --git a/fs/coredump.h b/fs/coredump.h deleted file mode 100644 index e39ff072110..00000000000 --- a/fs/coredump.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _FS_COREDUMP_H -#define _FS_COREDUMP_H - -extern int __get_dumpable(unsigned long mm_flags); - -#endif diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index cd06466f365..11b29d491b7 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -1,5 +1,5 @@ config CRAMFS - tristate "Compressed ROM file system support (cramfs)" + tristate "Compressed ROM file system support (cramfs) (OBSOLETE)" depends on BLOCK select ZLIB_INFLATE help @@ -16,4 +16,7 @@ config CRAMFS cramfs. Note that the root file system (the one containing the directory /) cannot be compiled as a module. + This filesystem is obsoleted by SquashFS, which is much better + in terms of performance and features. + If unsure, say N. diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index e501ac3a49f..ddcfe590b8a 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -17,14 +17,30 @@ #include <linux/init.h> #include <linux/string.h> #include <linux/blkdev.h> -#include <linux/cramfs_fs.h> #include <linux/slab.h> -#include <linux/cramfs_fs_sb.h> #include <linux/vfs.h> #include <linux/mutex.h> - +#include <uapi/linux/cramfs_fs.h> #include <asm/uaccess.h> +#include "internal.h" + +/* + * cramfs super-block data in memory + */ +struct cramfs_sb_info { + unsigned long magic; + unsigned long size; + unsigned long blocks; + unsigned long files; + unsigned long flags; +}; + +static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + static const struct super_operations cramfs_ops; static const struct inode_operations cramfs_dir_inode_operations; static const struct file_operations cramfs_directory_operations; @@ -179,8 +195,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i struct page *page = NULL; if (blocknr + i < devsize) { - page = read_mapping_page_async(mapping, blocknr + i, - NULL); + page = read_mapping_page(mapping, blocknr + i, NULL); /* synchronous error? */ if (IS_ERR(page)) page = NULL; @@ -219,14 +234,16 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i return read_buffers[buffer] + offset; } -static void cramfs_put_super(struct super_block *sb) +static void cramfs_kill_sb(struct super_block *sb) { - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + kill_block_super(sb); + kfree(sbi); } static int cramfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } @@ -261,7 +278,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) if (super.magic == CRAMFS_MAGIC_WEND) { if (!silent) printk(KERN_ERR "cramfs: wrong endianness\n"); - goto out; + return -EINVAL; } /* check at 512 byte offset */ @@ -273,20 +290,20 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_ERR "cramfs: wrong endianness\n"); else if (!silent) printk(KERN_ERR "cramfs: wrong magic\n"); - goto out; + return -EINVAL; } } /* get feature flags first */ if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { printk(KERN_ERR "cramfs: unsupported filesystem features\n"); - goto out; + return -EINVAL; } /* Check that the root inode is in a sane state */ if (!S_ISDIR(super.root.mode)) { printk(KERN_ERR "cramfs: root is not a directory\n"); - goto out; + return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); @@ -310,22 +327,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) (root_offset != 512 + sizeof(struct cramfs_super)))) { printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); - goto out; + return -EINVAL; } /* Set it all up.. */ sb->s_op = &cramfs_ops; root = get_cramfs_inode(sb, &super.root, 0); if (IS_ERR(root)) - goto out; + return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) - goto out; + return -ENOMEM; return 0; -out: - kfree(sbi); - sb->s_fs_info = NULL; - return -EINVAL; } static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -550,7 +563,6 @@ static const struct inode_operations cramfs_dir_inode_operations = { }; static const struct super_operations cramfs_ops = { - .put_super = cramfs_put_super, .remount_fs = cramfs_remount, .statfs = cramfs_statfs, }; @@ -565,7 +577,7 @@ static struct file_system_type cramfs_fs_type = { .owner = THIS_MODULE, .name = "cramfs", .mount = cramfs_mount, - .kill_sb = kill_block_super, + .kill_sb = cramfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("cramfs"); diff --git a/fs/cramfs/internal.h b/fs/cramfs/internal.h new file mode 100644 index 00000000000..349d7127215 --- /dev/null +++ b/fs/cramfs/internal.h @@ -0,0 +1,4 @@ +/* Uncompression interfaces to the underlying zlib */ +int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen); +int cramfs_uncompress_init(void); +void cramfs_uncompress_exit(void); diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c index 023329800d2..1760c1b84d9 100644 --- a/fs/cramfs/uncompress.c +++ b/fs/cramfs/uncompress.c @@ -19,7 +19,7 @@ #include <linux/errno.h> #include <linux/vmalloc.h> #include <linux/zlib.h> -#include <linux/cramfs_fs.h> +#include "internal.h" static z_stream stream; static int initialized; diff --git a/fs/dcache.c b/fs/dcache.c index 41000305d71..06f65857a85 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -88,35 +88,6 @@ EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; -/** - * read_seqbegin_or_lock - begin a sequence number check or locking block - * @lock: sequence lock - * @seq : sequence number to be checked - * - * First try it once optimistically without taking the lock. If that fails, - * take the lock. The sequence number is also used as a marker for deciding - * whether to be a reader (even) or writer (odd). - * N.B. seq must be initialized to an even number to begin with. - */ -static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) -{ - if (!(*seq & 1)) /* Even */ - *seq = read_seqbegin(lock); - else /* Odd */ - read_seqlock_excl(lock); -} - -static inline int need_seqretry(seqlock_t *lock, int seq) -{ - return !(seq & 1) && read_seqretry(lock, seq); -} - -static inline void done_seqretry(seqlock_t *lock, int seq) -{ - if (seq & 1) - read_sequnlock_excl(lock); -} - /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try @@ -125,8 +96,6 @@ static inline void done_seqretry(seqlock_t *lock, int seq) * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. */ -#define D_HASHBITS d_hash_shift -#define D_HASHMASK d_hash_mask static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; @@ -137,8 +106,8 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash + (hash >> D_HASHBITS); - return dentry_hashtable + (hash & D_HASHMASK); + hash = hash + (hash >> d_hash_shift); + return dentry_hashtable + (hash & d_hash_mask); } /* Statistics gathering. */ @@ -181,7 +150,7 @@ static long get_nr_dentry_unused(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, +int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); @@ -223,7 +192,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char if (!tcount) return 0; } - mask = ~(~0ul << tcount*8); + mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } @@ -277,16 +246,8 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -/* - * no locks, please. - */ -static void d_free(struct dentry *dentry) +static void dentry_free(struct dentry *dentry) { - BUG_ON((int)dentry->d_lockref.count > 0); - this_cpu_dec(nr_dentry); - if (dentry->d_op && dentry->d_op->d_release) - dentry->d_op->d_release(dentry); - /* if dentry was never visible to RCU, immediate free is OK */ if (!(dentry->d_flags & DCACHE_RCUACCESS)) __d_free(&dentry->d_u.d_rcu); @@ -343,6 +304,7 @@ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; + __d_clear_type(dentry); dentry->d_inode = NULL; hlist_del_init(&dentry->d_alias); dentry_rcuwalk_barrier(dentry); @@ -433,77 +395,6 @@ static void dentry_lru_add(struct dentry *dentry) d_lru_add(dentry); } -/* - * Remove a dentry with references from the LRU. - * - * If we are on the shrink list, then we can get to try_prune_one_dentry() and - * lose our last reference through the parent walk. In this case, we need to - * remove ourselves from the shrink list, not the LRU. - */ -static void dentry_lru_del(struct dentry *dentry) -{ - if (dentry->d_flags & DCACHE_LRU_LIST) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) - return d_shrink_del(dentry); - d_lru_del(dentry); - } -} - -/** - * d_kill - kill dentry and return parent - * @dentry: dentry to kill - * @parent: parent dentry - * - * The dentry must already be unhashed and removed from the LRU. - * - * If this is the root of the dentry tree, return NULL. - * - * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by - * d_kill. - */ -static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) - __releases(dentry->d_lock) - __releases(parent->d_lock) - __releases(dentry->d_inode->i_lock) -{ - list_del(&dentry->d_u.d_child); - /* - * Inform try_to_ascend() that we are no longer attached to the - * dentry tree - */ - dentry->d_flags |= DCACHE_DENTRY_KILLED; - if (parent) - spin_unlock(&parent->d_lock); - dentry_iput(dentry); - /* - * dentry_iput drops the locks, at which point nobody (except - * transient RCU lookups) can reach this dentry. - */ - d_free(dentry); - return parent; -} - -/* - * Unhash a dentry without inserting an RCU walk barrier or checking that - * dentry->d_lock is locked. The caller must take care of that, if - * appropriate. - */ -static void __d_shrink(struct dentry *dentry) -{ - if (!d_unhashed(dentry)) { - struct hlist_bl_head *b; - if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) - b = &dentry->d_sb->s_anon; - else - b = d_hash(dentry->d_parent, dentry->d_name.hash); - - hlist_bl_lock(b); - __hlist_bl_del(&dentry->d_hash); - dentry->d_hash.pprev = NULL; - hlist_bl_unlock(b); - } -} - /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -522,7 +413,21 @@ static void __d_shrink(struct dentry *dentry) void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { - __d_shrink(dentry); + struct hlist_bl_head *b; + /* + * Hashed dentries are normally on the dentry hashtable, + * with the exception of those newly allocated by + * d_obtain_alias, which are always IS_ROOT: + */ + if (unlikely(IS_ROOT(dentry))) + b = &dentry->d_sb->s_anon; + else + b = d_hash(dentry->d_parent, dentry->d_name.hash); + + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + dentry->d_hash.pprev = NULL; + hlist_bl_unlock(b); dentry_rcuwalk_barrier(dentry); } } @@ -536,37 +441,12 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); -/* - * Finish off a dentry we've decided to kill. - * dentry->d_lock must be held, returns with it unlocked. - * If ref is non-zero, then decrement the refcount too. - * Returns dentry requiring refcount drop, or NULL if we're done. - */ -static inline struct dentry * -dentry_kill(struct dentry *dentry, int unlock_on_failure) - __releases(dentry->d_lock) +static void __dentry_kill(struct dentry *dentry) { - struct inode *inode; - struct dentry *parent; - - inode = dentry->d_inode; - if (inode && !spin_trylock(&inode->i_lock)) { -relock: - if (unlock_on_failure) { - spin_unlock(&dentry->d_lock); - cpu_relax(); - } - return dentry; /* try again with same dentry */ - } - if (IS_ROOT(dentry)) - parent = NULL; - else + struct dentry *parent = NULL; + bool can_free = true; + if (!IS_ROOT(dentry)) parent = dentry->d_parent; - if (parent && !spin_trylock(&parent->d_lock)) { - if (inode) - spin_unlock(&inode->i_lock); - goto relock; - } /* * The dentry is now unrecoverably dead to the world. @@ -580,10 +460,105 @@ relock: if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); - dentry_lru_del(dentry); + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) + d_lru_del(dentry); + } /* if it was on the hash then remove it */ __d_drop(dentry); - return d_kill(dentry, parent); + list_del(&dentry->d_u.d_child); + /* + * Inform d_walk() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. + */ + BUG_ON((int)dentry->d_lockref.count > 0); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + dentry->d_flags |= DCACHE_MAY_FREE; + can_free = false; + } + spin_unlock(&dentry->d_lock); + if (likely(can_free)) + dentry_free(dentry); +} + +/* + * Finish off a dentry we've decided to kill. + * dentry->d_lock must be held, returns with it unlocked. + * If ref is non-zero, then decrement the refcount too. + * Returns dentry requiring refcount drop, or NULL if we're done. + */ +static struct dentry *dentry_kill(struct dentry *dentry) + __releases(dentry->d_lock) +{ + struct inode *inode = dentry->d_inode; + struct dentry *parent = NULL; + + if (inode && unlikely(!spin_trylock(&inode->i_lock))) + goto failed; + + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + if (unlikely(!spin_trylock(&parent->d_lock))) { + if (inode) + spin_unlock(&inode->i_lock); + goto failed; + } + } + + __dentry_kill(dentry); + return parent; + +failed: + spin_unlock(&dentry->d_lock); + cpu_relax(); + return dentry; /* try again with same dentry */ +} + +static inline struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *parent = dentry->d_parent; + if (IS_ROOT(dentry)) + return NULL; + if (unlikely((int)dentry->d_lockref.count < 0)) + return NULL; + if (likely(spin_trylock(&parent->d_lock))) + return parent; + rcu_read_lock(); + spin_unlock(&dentry->d_lock); +again: + parent = ACCESS_ONCE(dentry->d_parent); + spin_lock(&parent->d_lock); + /* + * We can't blindly lock dentry until we are sure + * that we won't violate the locking order. + * Any changes of dentry->d_parent must have + * been done with parent->d_lock held, so + * spin_lock() above is enough of a barrier + * for checking if it's still our child. + */ + if (unlikely(parent != dentry->d_parent)) { + spin_unlock(&parent->d_lock); + goto again; + } + rcu_read_unlock(); + if (parent != dentry) + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + else + parent = NULL; + return parent; } /* @@ -630,7 +605,8 @@ repeat: goto kill_it; } - dentry->d_flags |= DCACHE_REFERENCED; + if (!(dentry->d_flags & DCACHE_REFERENCED)) + dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); dentry->d_lockref.count--; @@ -638,7 +614,7 @@ repeat: return; kill_it: - dentry = dentry_kill(dentry, 1); + dentry = dentry_kill(dentry); if (dentry) goto repeat; } @@ -851,64 +827,15 @@ restart: } EXPORT_SYMBOL(d_prune_aliases); -/* - * Try to throw away a dentry - free the inode, dput the parent. - * Requires dentry->d_lock is held, and dentry->d_count == 0. - * Releases dentry->d_lock. - * - * This may fail if locks cannot be acquired no problem, just try again. - */ -static struct dentry * try_prune_one_dentry(struct dentry *dentry) - __releases(dentry->d_lock) -{ - struct dentry *parent; - - parent = dentry_kill(dentry, 0); - /* - * If dentry_kill returns NULL, we have nothing more to do. - * if it returns the same dentry, trylocks failed. In either - * case, just loop again. - * - * Otherwise, we need to prune ancestors too. This is necessary - * to prevent quadratic behavior of shrink_dcache_parent(), but - * is also expected to be beneficial in reducing dentry cache - * fragmentation. - */ - if (!parent) - return NULL; - if (parent == dentry) - return dentry; - - /* Prune ancestors. */ - dentry = parent; - while (dentry) { - if (lockref_put_or_lock(&dentry->d_lockref)) - return NULL; - dentry = dentry_kill(dentry, 1); - } - return NULL; -} - static void shrink_dentry_list(struct list_head *list) { - struct dentry *dentry; - - rcu_read_lock(); - for (;;) { - dentry = list_entry_rcu(list->prev, struct dentry, d_lru); - if (&dentry->d_lru == list) - break; /* empty */ + struct dentry *dentry, *parent; - /* - * Get the dentry lock, and re-verify that the dentry is - * this on the shrinking list. If it is, we know that - * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set. - */ + while (!list_empty(list)) { + struct inode *inode; + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); - if (dentry != list_entry(list->prev, struct dentry, d_lru)) { - spin_unlock(&dentry->d_lock); - continue; - } + parent = lock_parent(dentry); /* * The dispose list is isolated and dentries are not accounted @@ -921,30 +848,63 @@ static void shrink_dentry_list(struct list_head *list) * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free it. */ - if (dentry->d_lockref.count) { + if ((int)dentry->d_lockref.count > 0) { spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); continue; } - rcu_read_unlock(); - /* - * If 'try_to_prune()' returns a dentry, it will - * be the same one we passed in, and d_lock will - * have been held the whole time, so it will not - * have been added to any other lists. We failed - * to get the inode lock. - * - * We just add it back to the shrink list. - */ - dentry = try_prune_one_dentry(dentry); - rcu_read_lock(); - if (dentry) { + if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { + bool can_free = dentry->d_flags & DCACHE_MAY_FREE; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + if (can_free) + dentry_free(dentry); + continue; + } + + inode = dentry->d_inode; + if (inode && unlikely(!spin_trylock(&inode->i_lock))) { d_shrink_add(dentry, list); spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + continue; + } + + __dentry_kill(dentry); + + /* + * We need to prune ancestors too. This is necessary to prevent + * quadratic behavior of shrink_dcache_parent(), but is also + * expected to be beneficial in reducing dentry cache + * fragmentation. + */ + dentry = parent; + while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) { + parent = lock_parent(dentry); + if (dentry->d_lockref.count != 1) { + dentry->d_lockref.count--; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + break; + } + inode = dentry->d_inode; /* can't be NULL */ + if (unlikely(!spin_trylock(&inode->i_lock))) { + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + cpu_relax(); + continue; + } + __dentry_kill(dentry); + dentry = parent; } } - rcu_read_unlock(); } static enum lru_status @@ -1074,144 +1034,6 @@ void shrink_dcache_sb(struct super_block *sb) } EXPORT_SYMBOL(shrink_dcache_sb); -/* - * destroy a single subtree of dentries for unmount - * - see the comments on shrink_dcache_for_umount() for a description of the - * locking - */ -static void shrink_dcache_for_umount_subtree(struct dentry *dentry) -{ - struct dentry *parent; - - BUG_ON(!IS_ROOT(dentry)); - - for (;;) { - /* descend to the first leaf in the current subtree */ - while (!list_empty(&dentry->d_subdirs)) - dentry = list_entry(dentry->d_subdirs.next, - struct dentry, d_u.d_child); - - /* consume the dentries from this leaf up through its parents - * until we find one with children or run out altogether */ - do { - struct inode *inode; - - /* - * inform the fs that this dentry is about to be - * unhashed and destroyed. - */ - if ((dentry->d_flags & DCACHE_OP_PRUNE) && - !d_unhashed(dentry)) - dentry->d_op->d_prune(dentry); - - dentry_lru_del(dentry); - __d_shrink(dentry); - - if (dentry->d_lockref.count != 0) { - printk(KERN_ERR - "BUG: Dentry %p{i=%lx,n=%s}" - " still in use (%d)" - " [unmount of %s %s]\n", - dentry, - dentry->d_inode ? - dentry->d_inode->i_ino : 0UL, - dentry->d_name.name, - dentry->d_lockref.count, - dentry->d_sb->s_type->name, - dentry->d_sb->s_id); - BUG(); - } - - if (IS_ROOT(dentry)) { - parent = NULL; - list_del(&dentry->d_u.d_child); - } else { - parent = dentry->d_parent; - parent->d_lockref.count--; - list_del(&dentry->d_u.d_child); - } - - inode = dentry->d_inode; - if (inode) { - dentry->d_inode = NULL; - hlist_del_init(&dentry->d_alias); - if (dentry->d_op && dentry->d_op->d_iput) - dentry->d_op->d_iput(dentry, inode); - else - iput(inode); - } - - d_free(dentry); - - /* finished when we fall off the top of the tree, - * otherwise we ascend to the parent and move to the - * next sibling if there is one */ - if (!parent) - return; - dentry = parent; - } while (list_empty(&dentry->d_subdirs)); - - dentry = list_entry(dentry->d_subdirs.next, - struct dentry, d_u.d_child); - } -} - -/* - * destroy the dentries attached to a superblock on unmounting - * - we don't need to use dentry->d_lock because: - * - the superblock is detached from all mountings and open files, so the - * dentry trees will not be rearranged by the VFS - * - s_umount is write-locked, so the memory pressure shrinker will ignore - * any dentries belonging to this superblock that it comes across - * - the filesystem itself is no longer permitted to rearrange the dentries - * in this superblock - */ -void shrink_dcache_for_umount(struct super_block *sb) -{ - struct dentry *dentry; - - if (down_read_trylock(&sb->s_umount)) - BUG(); - - dentry = sb->s_root; - sb->s_root = NULL; - dentry->d_lockref.count--; - shrink_dcache_for_umount_subtree(dentry); - - while (!hlist_bl_empty(&sb->s_anon)) { - dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); - shrink_dcache_for_umount_subtree(dentry); - } -} - -/* - * This tries to ascend one level of parenthood, but - * we can race with renaming, so we need to re-check - * the parenthood after dropping the lock and check - * that the sequence number still matches. - */ -static struct dentry *try_to_ascend(struct dentry *old, unsigned seq) -{ - struct dentry *new = old->d_parent; - - rcu_read_lock(); - spin_unlock(&old->d_lock); - spin_lock(&new->d_lock); - - /* - * might go back up the wrong parent if we have had a rename - * or deletion - */ - if (new != old->d_parent || - (old->d_flags & DCACHE_DENTRY_KILLED) || - need_seqretry(&rename_lock, seq)) { - spin_unlock(&new->d_lock); - new = NULL; - } - rcu_read_unlock(); - return new; -} - /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk @@ -1300,9 +1122,24 @@ resume: */ if (this_parent != parent) { struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, seq); - if (!this_parent) + this_parent = child->d_parent; + + rcu_read_lock(); + spin_unlock(&child->d_lock); + spin_lock(&this_parent->d_lock); + + /* + * might go back up the wrong parent if we have had a rename + * or deletion + */ + if (this_parent != child->d_parent || + (child->d_flags & DCACHE_DENTRY_KILLED) || + need_seqretry(&rename_lock, seq)) { + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); goto rename_retry; + } + rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } @@ -1331,14 +1168,6 @@ rename_retry: * list is non-empty and continue searching. */ -/** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. - * - * Return true if the parent or its subdirectories contain - * a mount point - */ - static enum d_walk_ret check_mount(void *data, struct dentry *dentry) { int *ret = data; @@ -1349,6 +1178,13 @@ static enum d_walk_ret check_mount(void *data, struct dentry *dentry) return D_WALK_CONTINUE; } +/** + * have_submounts - check for mounts over a dentry + * @parent: dentry to check. + * + * Return true if the parent or its subdirectories contain + * a mount point + */ int have_submounts(struct dentry *parent) { int ret = 0; @@ -1421,34 +1257,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - /* - * move only zero ref count dentries to the dispose list. - * - * Those which are presently on the shrink list, being processed - * by shrink_dentry_list(), shouldn't be moved. Otherwise the - * loop in shrink_dcache_parent() might not make any progress - * and loop forever. - */ - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; - ret = D_WALK_NORETRY; + } else { + if (dentry->d_flags & DCACHE_LRU_LIST) + d_lru_del(dentry); + if (!dentry->d_lockref.count) { + d_shrink_add(dentry, &data->dispose); + data->found++; + } } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ - if (data->found && need_resched()) - ret = D_WALK_QUIT; + if (!list_empty(&data->dispose)) + ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } @@ -1478,6 +1303,56 @@ void shrink_dcache_parent(struct dentry *parent) } EXPORT_SYMBOL(shrink_dcache_parent); +static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) +{ + /* it has busy descendents; complain about those instead */ + if (!list_empty(&dentry->d_subdirs)) + return D_WALK_CONTINUE; + + /* root with refcount 1 is fine */ + if (dentry == _data && dentry->d_lockref.count == 1) + return D_WALK_CONTINUE; + + printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + " still in use (%d) [unmount of %s %s]\n", + dentry, + dentry->d_inode ? + dentry->d_inode->i_ino : 0UL, + dentry, + dentry->d_lockref.count, + dentry->d_sb->s_type->name, + dentry->d_sb->s_id); + WARN_ON(1); + return D_WALK_CONTINUE; +} + +static void do_one_tree(struct dentry *dentry) +{ + shrink_dcache_parent(dentry); + d_walk(dentry, dentry, umount_check, NULL); + d_drop(dentry); + dput(dentry); +} + +/* + * destroy the dentries attached to a superblock on unmounting + */ +void shrink_dcache_for_umount(struct super_block *sb) +{ + struct dentry *dentry; + + WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); + + dentry = sb->s_root; + sb->s_root = NULL; + do_one_tree(dentry); + + while (!hlist_bl_empty(&sb->s_anon)) { + dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash)); + do_one_tree(dentry); + } +} + static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; @@ -1638,12 +1513,17 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) } EXPORT_SYMBOL(d_alloc); +/** + * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) + * @sb: the superblock + * @name: qstr of the name + * + * For a filesystem that just pins its dentries in memory and never + * performs lookups at all, return an unhashed IS_ROOT dentry. + */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { - struct dentry *dentry = __d_alloc(sb, name); - if (dentry) - dentry->d_flags |= DCACHE_DISCONNECTED; - return dentry; + return __d_alloc(sb, name); } EXPORT_SYMBOL(d_alloc_pseudo); @@ -1685,14 +1565,41 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) } EXPORT_SYMBOL(d_set_d_op); +static unsigned d_flags_for_inode(struct inode *inode) +{ + unsigned add_flags = DCACHE_FILE_TYPE; + + if (!inode) + return DCACHE_MISS_TYPE; + + if (S_ISDIR(inode->i_mode)) { + add_flags = DCACHE_DIRECTORY_TYPE; + if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { + if (unlikely(!inode->i_op->lookup)) + add_flags = DCACHE_AUTODIR_TYPE; + else + inode->i_opflags |= IOP_LOOKUP; + } + } else if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { + if (unlikely(inode->i_op->follow_link)) + add_flags = DCACHE_SYMLINK_TYPE; + else + inode->i_opflags |= IOP_NOFOLLOW; + } + + if (unlikely(IS_AUTOMOUNT(inode))) + add_flags |= DCACHE_NEED_AUTOMOUNT; + return add_flags; +} + static void __d_instantiate(struct dentry *dentry, struct inode *inode) { + unsigned add_flags = d_flags_for_inode(inode); + spin_lock(&dentry->d_lock); - if (inode) { - if (unlikely(IS_AUTOMOUNT(inode))) - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; + __d_set_type(dentry, add_flags); + if (inode) hlist_add_head(&dentry->d_alias, &inode->i_dentry); - } dentry->d_inode = inode; dentry_rcuwalk_barrier(dentry); spin_unlock(&dentry->d_lock); @@ -1801,6 +1708,33 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) EXPORT_SYMBOL(d_instantiate_unique); +/** + * d_instantiate_no_diralias - instantiate a non-aliased dentry + * @entry: dentry to complete + * @inode: inode to attach to this dentry + * + * Fill in inode information in the entry. If a directory alias is found, then + * return an error (and drop inode). Together with d_materialise_unique() this + * guarantees that a directory inode may never have more than one alias. + */ +int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode) +{ + BUG_ON(!hlist_unhashed(&entry->d_alias)); + + spin_lock(&inode->i_lock); + if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) { + spin_unlock(&inode->i_lock); + iput(inode); + return -EBUSY; + } + __d_instantiate(entry, inode); + spin_unlock(&inode->i_lock); + security_d_instantiate(entry, inode); + + return 0; +} +EXPORT_SYMBOL(d_instantiate_no_diralias); + struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; @@ -1870,6 +1804,7 @@ struct dentry *d_obtain_alias(struct inode *inode) static const struct qstr anonstring = QSTR_INIT("/", 1); struct dentry *tmp; struct dentry *res; + unsigned add_flags; if (!inode) return ERR_PTR(-ESTALE); @@ -1895,9 +1830,11 @@ struct dentry *d_obtain_alias(struct inode *inode) } /* attach a disconnected dentry */ + add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED; + spin_lock(&tmp->d_lock); tmp->d_inode = inode; - tmp->d_flags |= DCACHE_DISCONNECTED; + tmp->d_flags |= add_flags; hlist_add_head(&tmp->d_alias, &inode->i_dentry); hlist_bl_lock(&tmp->d_sb->s_anon); hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); @@ -2495,12 +2432,14 @@ static void switch_names(struct dentry *dentry, struct dentry *target) dentry->d_name.name = dentry->d_iname; } else { /* - * Both are internal. Just copy target to dentry + * Both are internal. */ - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); - dentry->d_name.len = target->d_name.len; - return; + unsigned int i; + BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); + for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { + swap(((long *) &dentry->d_iname)[i], + ((long *) &target->d_iname)[i]); + } } } swap(dentry->d_name.len, target->d_name.len); @@ -2557,13 +2496,15 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry, * __d_move - move a dentry * @dentry: entry to move * @target: new dentry + * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. Caller must hold * rename_lock, the i_mutex of the source and target directories, * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ -static void __d_move(struct dentry * dentry, struct dentry * target) +static void __d_move(struct dentry *dentry, struct dentry *target, + bool exchange) { if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -2574,7 +2515,7 @@ static void __d_move(struct dentry * dentry, struct dentry * target) dentry_lock_for_move(dentry, target); write_seqcount_begin(&dentry->d_seq); - write_seqcount_begin(&target->d_seq); + write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ @@ -2585,8 +2526,15 @@ static void __d_move(struct dentry * dentry, struct dentry * target) __d_drop(dentry); __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); - /* Unhash the target: dput() will then get rid of it */ + /* + * Unhash the target (d_delete() is not usable here). If exchanging + * the two dentries, then rehash onto the other's hash queue. + */ __d_drop(target); + if (exchange) { + __d_rehash(target, + d_hash(dentry->d_parent, dentry->d_name.hash)); + } list_del(&dentry->d_u.d_child); list_del(&target->d_u.d_child); @@ -2613,6 +2561,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target) write_seqcount_end(&dentry->d_seq); dentry_unlock_parents_for_move(dentry, target); + if (exchange) + fsnotify_d_move(target); spin_unlock(&target->d_lock); fsnotify_d_move(dentry); spin_unlock(&dentry->d_lock); @@ -2630,11 +2580,30 @@ static void __d_move(struct dentry * dentry, struct dentry * target) void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); - __d_move(dentry, target); + __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); +/* + * d_exchange - exchange two dentries + * @dentry1: first dentry + * @dentry2: second dentry + */ +void d_exchange(struct dentry *dentry1, struct dentry *dentry2) +{ + write_seqlock(&rename_lock); + + WARN_ON(!dentry1->d_inode); + WARN_ON(!dentry2->d_inode); + WARN_ON(IS_ROOT(dentry1)); + WARN_ON(IS_ROOT(dentry2)); + + __d_move(dentry1, dentry2, true); + + write_sequnlock(&rename_lock); +} + /** * d_ancestor - search for an ancestor * @p1: ancestor dentry @@ -2682,7 +2651,7 @@ static struct dentry *__d_unalias(struct inode *inode, m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: if (likely(!d_mountpoint(alias))) { - __d_move(alias, dentry); + __d_move(alias, dentry, false); ret = alias; } out_err: @@ -2706,7 +2675,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) dentry_lock_for_move(anon, dentry); write_seqcount_begin(&dentry->d_seq); - write_seqcount_begin(&anon->d_seq); + write_seqcount_begin_nested(&anon->d_seq, DENTRY_D_LOCK_NESTED); dparent = dentry->d_parent; @@ -2725,7 +2694,6 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) spin_unlock(&dentry->d_lock); /* anon->d_lock still locked, returns locked */ - anon->d_flags &= ~DCACHE_DISCONNECTED; } /** @@ -2846,9 +2814,9 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) u32 dlen = ACCESS_ONCE(name->len); char *p; - if (*buflen < dlen + 1) - return -ENAMETOOLONG; *buflen -= dlen + 1; + if (*buflen < 0) + return -ENAMETOOLONG; p = *buffer -= dlen + 1; *p++ = '/'; while (dlen--) { @@ -2881,27 +2849,36 @@ static int prepend_path(const struct path *path, const struct path *root, char **buffer, int *buflen) { - struct dentry *dentry = path->dentry; - struct vfsmount *vfsmnt = path->mnt; - struct mount *mnt = real_mount(vfsmnt); + struct dentry *dentry; + struct vfsmount *vfsmnt; + struct mount *mnt; int error = 0; - unsigned seq = 0; + unsigned seq, m_seq = 0; char *bptr; int blen; rcu_read_lock(); +restart_mnt: + read_seqbegin_or_lock(&mount_lock, &m_seq); + seq = 0; + rcu_read_lock(); restart: bptr = *buffer; blen = *buflen; + error = 0; + dentry = path->dentry; + vfsmnt = path->mnt; + mnt = real_mount(vfsmnt); read_seqbegin_or_lock(&rename_lock, &seq); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { + struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); /* Global root? */ - if (mnt_has_parent(mnt)) { - dentry = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; + if (mnt != parent) { + dentry = ACCESS_ONCE(mnt->mnt_mountpoint); + mnt = parent; vfsmnt = &mnt->mnt; continue; } @@ -2936,6 +2913,14 @@ restart: } done_seqretry(&rename_lock, seq); + if (!(m_seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&mount_lock, m_seq)) { + m_seq = 1; + goto restart_mnt; + } + done_seqretry(&mount_lock, m_seq); + if (error >= 0 && bptr == *buffer) { if (--blen < 0) error = -ENAMETOOLONG; @@ -2971,9 +2956,7 @@ char *__d_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); - br_read_lock(&vfsmount_lock); error = prepend_path(path, root, &res, &buflen); - br_read_unlock(&vfsmount_lock); if (error < 0) return ERR_PTR(error); @@ -2990,9 +2973,7 @@ char *d_absolute_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); - br_read_lock(&vfsmount_lock); error = prepend_path(path, &root, &res, &buflen); - br_read_unlock(&vfsmount_lock); if (error > 1) error = -EINVAL; @@ -3061,15 +3042,18 @@ char *d_path(const struct path *path, char *buf, int buflen) * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: + * + * Some pseudo inodes are mountable. When they are mounted + * path->dentry == path->mnt->mnt_root. In that case don't call d_dname + * and instead have d_path return the mounted path. */ - if (path->dentry->d_op && path->dentry->d_op->d_dname) + if (path->dentry->d_op && path->dentry->d_op->d_dname && + (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); rcu_read_lock(); get_fs_root_rcu(current->fs, &root); - br_read_lock(&vfsmount_lock); error = path_with_deleted(path, &root, &res, &buflen); - br_read_unlock(&vfsmount_lock); rcu_read_unlock(); if (error < 0) @@ -3109,30 +3093,33 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) end = ERR_PTR(-ENAMETOOLONG); return end; } +EXPORT_SYMBOL(simple_dname); /* * Write full pathname from the root of the filesystem into the buffer. */ -static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) +static char *__dentry_path(struct dentry *d, char *buf, int buflen) { + struct dentry *dentry; char *end, *retval; int len, seq = 0; int error = 0; + if (buflen < 2) + goto Elong; + rcu_read_lock(); restart: + dentry = d; end = buf + buflen; len = buflen; prepend(&end, &len, "\0", 1); - if (buflen < 1) - goto Elong; /* Get '/' right */ retval = end-1; *retval = '/'; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { struct dentry *parent = dentry->d_parent; - int error; prefetch(parent); error = prepend_name(&end, &len, &dentry->d_name); @@ -3224,7 +3211,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); error = -ENOENT; - br_read_lock(&vfsmount_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; char *cwd = page + PATH_MAX; @@ -3232,7 +3218,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); - br_read_unlock(&vfsmount_lock); rcu_read_unlock(); if (error < 0) @@ -3253,7 +3238,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) error = -EFAULT; } } else { - br_read_unlock(&vfsmount_lock); rcu_read_unlock(); } diff --git a/fs/dcookies.c b/fs/dcookies.c index ab5954b5026..ac44a69fbea 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -204,7 +204,7 @@ out: } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) +COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len) { #ifdef __BIG_ENDIAN return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c7c83ff0f75..8c41b52da35 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -218,6 +218,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data) int err; struct debugfs_fs_info *fsi = sb->s_fs_info; + sync_filesystem(sb); err = debugfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; @@ -358,7 +359,7 @@ exit: * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on @@ -400,7 +401,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file); * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is NULL, then the * directory will be created in the root of the debugfs filesystem. * * This function creates a directory in debugfs with the given name. @@ -425,7 +426,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); * @name: a pointer to a string containing the name of the symbolic link to * create. * @parent: a pointer to the parent dentry for this symbolic link. This - * should be a directory dentry if set. If this paramater is NULL, + * should be a directory dentry if set. If this parameter is NULL, * then the symbolic link will be created in the root of the debugfs * filesystem. * @target: a pointer to a string containing the path to the target of the @@ -566,8 +567,7 @@ void debugfs_remove_recursive(struct dentry *dentry) mutex_lock(&parent->d_inode->i_mutex); if (child != dentry) { - next = list_entry(child->d_u.d_child.next, struct dentry, - d_u.d_child); + next = list_next_entry(child, d_u.d_child); goto up; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 073d30b9d1a..cfe8466f7fe 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -10,6 +10,8 @@ * * ------------------------------------------------------------------------- */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> @@ -148,10 +150,10 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) /* * parse_mount_options(): - * Set @opts to mount options specified in @data. If an option is not - * specified in @data, set it to its default value. The exception is - * 'newinstance' option which can only be set/cleared on a mount (i.e. - * cannot be changed during remount). + * Set @opts to mount options specified in @data. If an option is not + * specified in @data, set it to its default value. The exception is + * 'newinstance' option which can only be set/cleared on a mount (i.e. + * cannot be changed during remount). * * Note: @data may be NULL (in which case all options are set to default). */ @@ -225,7 +227,7 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) break; #endif default: - printk(KERN_ERR "devpts: called with bogus options\n"); + pr_err("called with bogus options\n"); return -EINVAL; } } @@ -261,7 +263,7 @@ static int mknod_ptmx(struct super_block *sb) dentry = d_alloc_name(root, "ptmx"); if (!dentry) { - printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n"); + pr_err("Unable to alloc dentry for ptmx node\n"); goto out; } @@ -270,7 +272,7 @@ static int mknod_ptmx(struct super_block *sb) */ inode = new_inode(sb); if (!inode) { - printk(KERN_ERR "Unable to alloc inode for ptmx node\n"); + pr_err("Unable to alloc inode for ptmx node\n"); dput(dentry); goto out; } @@ -303,7 +305,7 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) #else static inline void update_ptmx_mode(struct pts_fs_info *fsi) { - return; + return; } #endif @@ -313,6 +315,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; + sync_filesystem(sb); err = parse_mount_options(data, PARSE_REMOUNT, opts); /* @@ -332,9 +335,11 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) struct pts_mount_opts *opts = &fsi->mount_opts; if (opts->setuid) - seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, opts->uid)); + seq_printf(seq, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->uid)); if (opts->setgid) - seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); + seq_printf(seq, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->gid)); seq_printf(seq, ",mode=%03o", opts->mode); #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); @@ -395,7 +400,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) if (s->s_root) return 0; - printk(KERN_ERR "devpts: get root dentry failed\n"); + pr_err("get root dentry failed\n"); fail: return -ENOMEM; @@ -498,6 +503,7 @@ static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); + ida_destroy(&fsi->allocated_ptys); kfree(fsi); kill_litter_super(sb); } diff --git a/fs/direct-io.c b/fs/direct-io.c index 0e04142d596..17e39b047de 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -71,13 +71,11 @@ struct dio_submit { been performed at the start of a write */ int pages_in_io; /* approximate total IO pages */ - size_t size; /* total request size (doesn't change)*/ sector_t block_in_file; /* Current offset into the underlying file in dio_block units. */ unsigned blocks_available; /* At block_in_file. changes */ int reap_counter; /* rate limit reaping */ sector_t final_block_in_request;/* doesn't change */ - unsigned first_block_in_page; /* doesn't change, Used only once */ int boundary; /* prev block is at a boundary */ get_block_t *get_block; /* block mapping function */ dio_submit_t *submit_io; /* IO submition function */ @@ -98,19 +96,14 @@ struct dio_submit { sector_t cur_page_block; /* Where it starts */ loff_t cur_page_fs_offset; /* Offset in file */ - /* - * Page fetching state. These variables belong to dio_refill_pages(). - */ - int curr_page; /* changes */ - int total_pages; /* doesn't change */ - unsigned long curr_user_address;/* changes */ - + struct iov_iter *iter; /* * Page queue. These variables belong to dio_refill_pages() and * dio_get_page(). */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ + size_t from, to; }; /* dio_state communicated between submission path and end_io */ @@ -163,15 +156,10 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio) */ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { - int ret; - int nr_pages; + ssize_t ret; - nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); - ret = get_user_pages_fast( - sdio->curr_user_address, /* Where from? */ - nr_pages, /* How many pages? */ - dio->rw == READ, /* Write to memory? */ - &dio->pages[0]); /* Put results here */ + ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE, + &sdio->from); if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(0); @@ -186,18 +174,19 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) dio->pages[0] = page; sdio->head = 0; sdio->tail = 1; - ret = 0; - goto out; + sdio->from = 0; + sdio->to = PAGE_SIZE; + return 0; } if (ret >= 0) { - sdio->curr_user_address += ret * PAGE_SIZE; - sdio->curr_page += ret; + iov_iter_advance(sdio->iter, ret); + ret += sdio->from; sdio->head = 0; - sdio->tail = ret; - ret = 0; + sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE; + sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1; + return 0; } -out: return ret; } @@ -208,7 +197,7 @@ out: * L1 cache. */ static inline struct page *dio_get_page(struct dio *dio, - struct dio_submit *sdio) + struct dio_submit *sdio) { if (dio_pages_present(sdio) == 0) { int ret; @@ -218,7 +207,7 @@ static inline struct page *dio_get_page(struct dio *dio, return ERR_PTR(ret); BUG_ON(dio_pages_present(sdio) == 0); } - return dio->pages[sdio->head++]; + return dio->pages[sdio->head]; } /** @@ -375,7 +364,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio = bio_alloc(GFP_KERNEL, nr_vecs); bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; else @@ -422,8 +411,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) */ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { - while (dio_pages_present(sdio)) - page_cache_release(dio_get_page(dio, sdio)); + while (sdio->head < sdio->tail) + page_cache_release(dio->pages[sdio->head++]); } /* @@ -664,7 +653,6 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, goto out; sector = start_sector << (sdio->blkbits - 9); nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); - nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); sdio->boundary = 0; @@ -719,7 +707,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, if (sdio->bio) { loff_t cur_offset = sdio->cur_page_fs_offset; loff_t bio_next_offset = sdio->logical_offset_in_bio + - sdio->bio->bi_size; + sdio->bio->bi_iter.bi_size; /* * See whether this new request is contiguous with the old. @@ -913,23 +901,22 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { const unsigned blkbits = sdio->blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; - struct page *page; - unsigned block_in_page; int ret = 0; - /* The I/O can start at any block offset within the first page */ - block_in_page = sdio->first_block_in_page; - while (sdio->block_in_file < sdio->final_block_in_request) { + struct page *page; + size_t from, to; + page = dio_get_page(dio, sdio); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } + from = sdio->head ? 0 : sdio->from; + to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE; + sdio->head++; - while (block_in_page < blocks_per_page) { - unsigned offset_in_page = block_in_page << blkbits; + while (from < to) { unsigned this_chunk_bytes; /* # of bytes mapped */ unsigned this_chunk_blocks; /* # of blocks */ unsigned u; @@ -1000,10 +987,10 @@ do_holes: page_cache_release(page); goto out; } - zero_user(page, block_in_page << blkbits, - 1 << blkbits); + zero_user(page, from, 1 << blkbits); sdio->block_in_file++; - block_in_page++; + from += 1 << blkbits; + dio->result += 1 << blkbits; goto next_block; } @@ -1020,7 +1007,7 @@ do_holes: * can add to this page */ this_chunk_blocks = sdio->blocks_available; - u = (PAGE_SIZE - offset_in_page) >> blkbits; + u = (to - from) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; u = sdio->final_block_in_request - sdio->block_in_file; @@ -1032,7 +1019,7 @@ do_holes: if (this_chunk_blocks == sdio->blocks_available) sdio->boundary = buffer_boundary(map_bh); ret = submit_page_section(dio, sdio, page, - offset_in_page, + from, this_chunk_bytes, sdio->next_block_for_io, map_bh); @@ -1043,7 +1030,8 @@ do_holes: sdio->next_block_for_io += this_chunk_blocks; sdio->block_in_file += this_chunk_blocks; - block_in_page += this_chunk_blocks; + from += this_chunk_bytes; + dio->result += this_chunk_bytes; sdio->blocks_available -= this_chunk_blocks; next_block: BUG_ON(sdio->block_in_file > sdio->final_block_in_request); @@ -1053,7 +1041,6 @@ next_block: /* Drop the ref which was taken in get_user_pages() */ page_cache_release(page); - block_in_page = 0; } out: return ret; @@ -1108,24 +1095,21 @@ static inline int drop_refcount(struct dio *dio) */ static inline ssize_t do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + struct block_device *bdev, struct iov_iter *iter, loff_t offset, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { - int seg; - size_t size; - unsigned long addr; unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; - loff_t end = offset; + size_t count = iov_iter_count(iter); + loff_t end = offset + count; struct dio *dio; struct dio_submit sdio = { 0, }; - unsigned long user_addr; - size_t bytes; struct buffer_head map_bh = { 0, }; struct blk_plug plug; + unsigned long align = offset | iov_iter_alignment(iter); if (rw & WRITE) rw = WRITE_ODIRECT; @@ -1135,32 +1119,16 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * the early prefetch in the caller enough time. */ - if (offset & blocksize_mask) { + if (align & blocksize_mask) { if (bdev) blkbits = blksize_bits(bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; - if (offset & blocksize_mask) + if (align & blocksize_mask) goto out; } - /* Check the memory alignment. Blocks cannot straddle pages */ - for (seg = 0; seg < nr_segs; seg++) { - addr = (unsigned long)iov[seg].iov_base; - size = iov[seg].iov_len; - end += size; - if (unlikely((addr & blocksize_mask) || - (size & blocksize_mask))) { - if (bdev) - blkbits = blksize_bits( - bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) - goto out; - } - } - /* watch out for a 0 len io from a tricksy fs */ - if (rw == READ && end == offset) + if (rw == READ && !iov_iter_count(iter)) return 0; dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); @@ -1194,13 +1162,19 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } /* - * For file extending writes updating i_size before data - * writeouts complete can expose uninitialized blocks. So - * even for AIO, we need to wait for i/o to complete before - * returning in this case. + * For file extending writes updating i_size before data writeouts + * complete can expose uninitialized blocks in dumb filesystems. + * In that case we need to wait for I/O completion even if asked + * for an asynchronous write. */ - dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && - (end > i_size_read(inode))); + if (is_sync_kiocb(iocb)) + dio->is_async = false; + else if (!(dio->flags & DIO_ASYNC_EXTEND) && + (rw & WRITE) && end > i_size_read(inode)) + dio->is_async = false; + else + dio->is_async = true; + dio->inode = inode; dio->rw = rw; @@ -1244,6 +1218,10 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, spin_lock_init(&dio->bio_lock); dio->refcount = 1; + sdio.iter = iter; + sdio.final_block_in_request = + (offset + iov_iter_count(iter)) >> blkbits; + /* * In case of non-aligned buffers, we may need 2 more * pages since we need to zero out first and last block. @@ -1251,47 +1229,13 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (unlikely(sdio.blkfactor)) sdio.pages_in_io = 2; - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - sdio.pages_in_io += - ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / - PAGE_SIZE - user_addr / PAGE_SIZE); - } + sdio.pages_in_io += iov_iter_npages(iter, INT_MAX); blk_start_plug(&plug); - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - sdio.size += bytes = iov[seg].iov_len; - - /* Index into the first page of the first block */ - sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; - sdio.final_block_in_request = sdio.block_in_file + - (bytes >> blkbits); - /* Page fetching state */ - sdio.head = 0; - sdio.tail = 0; - sdio.curr_page = 0; - - sdio.total_pages = 0; - if (user_addr & (PAGE_SIZE-1)) { - sdio.total_pages++; - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); - } - sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - sdio.curr_user_address = user_addr; - - retval = do_direct_IO(dio, &sdio, &map_bh); - - dio->result += iov[seg].iov_len - - ((sdio.final_block_in_request - sdio.block_in_file) << - blkbits); - - if (retval) { - dio_cleanup(dio, &sdio); - break; - } - } /* end iovec loop */ + retval = do_direct_IO(dio, &sdio, &map_bh); + if (retval) + dio_cleanup(dio, &sdio); if (retval == -ENOTBLK) { /* @@ -1343,10 +1287,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, */ BUG_ON(retval == -EIOCBQUEUED); if (dio->is_async && retval == 0 && dio->result && - ((rw == READ) || (dio->result == sdio.size))) + (rw == READ || dio->result == count)) retval = -EIOCBQUEUED; - - if (retval != -EIOCBQUEUED) + else dio_await_completion(dio); if (drop_refcount(dio) == 0) { @@ -1360,8 +1303,8 @@ out: ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + struct block_device *bdev, struct iov_iter *iter, loff_t offset, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { /* @@ -1376,9 +1319,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, prefetch(bdev->bd_queue); prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); - return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, - submit_io, flags); + return do_blockdev_direct_IO(rw, iocb, inode, bdev, iter, offset, + get_block, end_io, submit_io, flags); } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 0e90f0c91b9..dcea1e37a1b 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -14,6 +14,7 @@ #include "dlm_internal.h" #include "lock.h" #include "user.h" +#include "ast.h" static uint64_t dlm_cb_seq; static DEFINE_SPINLOCK(dlm_cb_seq_spin); @@ -308,6 +309,6 @@ void dlm_callback_resume(struct dlm_ls *ls) mutex_unlock(&ls->ls_cb_mutex); if (count) - log_debug(ls, "dlm_callback_resume %d", count); + log_rinfo(ls, "dlm_callback_resume %d", count); } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 76feb4b60fa..d521bddf876 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -157,11 +157,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, const char *buf, size_t len) { unsigned int x; + int rc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - - x = simple_strtoul(buf, NULL, 0); + rc = kstrtouint(buf, 0, &x); + if (rc) + return rc; if (check_zero && !x) return -EINVAL; @@ -730,7 +732,10 @@ static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf) static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, size_t len) { - cm->nodeid = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &cm->nodeid); + + if (rc) + return rc; return len; } @@ -742,7 +747,10 @@ static ssize_t comm_local_read(struct dlm_comm *cm, char *buf) static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, size_t len) { - cm->local= simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &cm->local); + + if (rc) + return rc; if (cm->local && !local_comm) local_comm = cm; return len; @@ -846,7 +854,10 @@ static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, size_t len) { uint32_t seq = 0; - nd->nodeid = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &nd->nodeid); + + if (rc) + return rc; dlm_comm_seq(nd->nodeid, &seq); nd->comm_seq = seq; return len; @@ -860,7 +871,10 @@ static ssize_t node_weight_read(struct dlm_node *nd, char *buf) static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, size_t len) { - nd->weight = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &nd->weight); + + if (rc) + return rc; return len; } diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index b969deef9eb..8d77ba7b175 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -68,7 +68,7 @@ static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - return seq_printf(s, "\n"); + return seq_puts(s, "\n"); } static int print_format1(struct dlm_rsb *res, struct seq_file *s) @@ -92,31 +92,31 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) } if (res->res_nodeid > 0) - rv = seq_printf(s, "\" \nLocal Copy, Master is node %d\n", + rv = seq_printf(s, "\"\nLocal Copy, Master is node %d\n", res->res_nodeid); else if (res->res_nodeid == 0) - rv = seq_printf(s, "\" \nMaster Copy\n"); + rv = seq_puts(s, "\"\nMaster Copy\n"); else if (res->res_nodeid == -1) - rv = seq_printf(s, "\" \nLooking up master (lkid %x)\n", + rv = seq_printf(s, "\"\nLooking up master (lkid %x)\n", res->res_first_lkid); else - rv = seq_printf(s, "\" \nInvalid master %d\n", + rv = seq_printf(s, "\"\nInvalid master %d\n", res->res_nodeid); if (rv) goto out; /* Print the LVB: */ if (res->res_lvbptr) { - seq_printf(s, "LVB: "); + seq_puts(s, "LVB: "); for (i = 0; i < lvblen; i++) { if (i == lvblen / 2) - seq_printf(s, "\n "); + seq_puts(s, "\n "); seq_printf(s, "%02x ", (unsigned char) res->res_lvbptr[i]); } if (rsb_flag(res, RSB_VALNOTVALID)) - seq_printf(s, " (INVALID)"); - rv = seq_printf(s, "\n"); + seq_puts(s, " (INVALID)"); + rv = seq_puts(s, "\n"); if (rv) goto out; } @@ -133,21 +133,21 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) } /* Print the locks attached to this resource */ - seq_printf(s, "Granted Queue\n"); + seq_puts(s, "Granted Queue\n"); list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) { rv = print_format1_lock(s, lkb, res); if (rv) goto out; } - seq_printf(s, "Conversion Queue\n"); + seq_puts(s, "Conversion Queue\n"); list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) { rv = print_format1_lock(s, lkb, res); if (rv) goto out; } - seq_printf(s, "Waiting Queue\n"); + seq_puts(s, "Waiting Queue\n"); list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) { rv = print_format1_lock(s, lkb, res); if (rv) @@ -157,13 +157,13 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) if (list_empty(&res->res_lookup)) goto out; - seq_printf(s, "Lookup Queue\n"); + seq_puts(s, "Lookup Queue\n"); list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) { rv = seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_rqmode)); if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); } out: unlock_rsb(res); @@ -300,7 +300,7 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); if (rv) goto out; @@ -311,7 +311,7 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s) for (i = 0; i < lvblen; i++) seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); if (rv) goto out; @@ -377,7 +377,7 @@ static int print_format4(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); out: unlock_rsb(r); return rv; diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 278a75cda44..d975851a7e1 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -68,7 +68,7 @@ int dlm_recover_directory(struct dlm_ls *ls) uint16_t namelen; unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0; - log_debug(ls, "dlm_recover_directory"); + log_rinfo(ls, "dlm_recover_directory"); if (dlm_no_directory(ls)) goto out_status; @@ -189,7 +189,7 @@ int dlm_recover_directory(struct dlm_ls *ls) error = 0; dlm_set_recover_status(ls, DLM_RS_DIR); - log_debug(ls, "dlm_recover_directory %u in %u new", + log_rinfo(ls, "dlm_recover_directory %u in %u new", count, count_add); out_free: kfree(last_name); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index e7665c31f7b..5eff6ea3e27 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -65,6 +65,8 @@ struct dlm_mhandle; printk(KERN_ERR "dlm: "fmt"\n" , ##args) #define log_error(ls, fmt, args...) \ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) +#define log_rinfo(ls, fmt, args...) \ + printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args); #define log_debug(ls, fmt, args...) \ do { \ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index e223a911a83..83f3d552030 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -687,6 +687,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, log_error(ls, "find_rsb new from_other %d dir %d our %d %s", from_nodeid, dir_nodeid, our_nodeid, r->res_name); dlm_free_rsb(r); + r = NULL; error = -ENOTBLK; goto out_unlock; } @@ -5462,7 +5463,7 @@ void dlm_recover_purge(struct dlm_ls *ls) up_write(&ls->ls_root_sem); if (lkb_count) - log_debug(ls, "dlm_recover_purge %u locks for %u nodes", + log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes", lkb_count, nodes_count); } @@ -5536,7 +5537,7 @@ void dlm_recover_grant(struct dlm_ls *ls) } if (lkb_count) - log_debug(ls, "dlm_recover_grant %u locks on %u resources", + log_rinfo(ls, "dlm_recover_grant %u locks on %u resources", lkb_count, rsb_count); } @@ -5695,7 +5696,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) put_rsb(r); out: if (error && error != -EEXIST) - log_debug(ls, "dlm_recover_master_copy remote %d %x error %d", + log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d", from_nodeid, remid, error); rl->rl_result = cpu_to_le32(error); return error; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 88556dc0458..f3e72787e7f 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -35,8 +35,11 @@ static struct task_struct * scand_task; static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) { ssize_t ret = len; - int n = simple_strtol(buf, NULL, 0); + int n; + int rc = kstrtoint(buf, 0, &n); + if (rc) + return rc; ls = dlm_find_lockspace_local(ls->ls_local_handle); if (!ls) return -EINVAL; @@ -57,7 +60,10 @@ static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len) { - ls->ls_uevent_result = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &ls->ls_uevent_result); + + if (rc) + return rc; set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags); wake_up(&ls->ls_uevent_wait); return len; @@ -70,7 +76,10 @@ static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf) static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len) { - ls->ls_global_id = simple_strtoul(buf, NULL, 0); + int rc = kstrtouint(buf, 0, &ls->ls_global_id); + + if (rc) + return rc; return len; } @@ -81,7 +90,11 @@ static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf) static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len) { - int val = simple_strtoul(buf, NULL, 0); + int val; + int rc = kstrtoint(buf, 0, &val); + + if (rc) + return rc; if (val == 1) set_bit(LSFL_NODIR, &ls->ls_flags); return len; @@ -190,7 +203,7 @@ static int do_uevent(struct dlm_ls *ls, int in) else kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); - log_debug(ls, "%s the lockspace group...", in ? "joining" : "leaving"); + log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving"); /* dlm_controld will see the uevent, do the necessary group management and then write to sysfs to wake us */ @@ -198,7 +211,7 @@ static int do_uevent(struct dlm_ls *ls, int in) error = wait_event_interruptible(ls->ls_uevent_wait, test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); - log_debug(ls, "group event done %d %d", error, ls->ls_uevent_result); + log_rinfo(ls, "group event done %d %d", error, ls->ls_uevent_result); if (error) goto out; @@ -640,7 +653,7 @@ static int new_lockspace(const char *name, const char *cluster, dlm_create_debug_file(ls); - log_debug(ls, "join complete"); + log_rinfo(ls, "join complete"); *lockspace = ls; return 0; @@ -706,9 +719,7 @@ static int lkb_idr_is_local(int id, void *p, void *data) { struct dlm_lkb *lkb = p; - if (!lkb->lkb_nodeid) - return 1; - return 0; + return lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV; } static int lkb_idr_is_any(int id, void *p, void *data) @@ -837,7 +848,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_clear_members(ls); dlm_clear_members_gone(ls); kfree(ls->ls_node_array); - log_debug(ls, "release_lockspace final free"); + log_rinfo(ls, "release_lockspace final free"); kobject_put(&ls->ls_kobj); /* The ls structure will be freed when the kobject is done with */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d90909ec6aa..d08e079ea5d 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -424,7 +424,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) } /* Data available on socket or listen socket received a connect */ -static void lowcomms_data_ready(struct sock *sk, int count_unused) +static void lowcomms_data_ready(struct sock *sk) { struct connection *con = sock2con(sk); if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) @@ -617,6 +617,11 @@ static void retry_failed_sctp_send(struct connection *recv_con, int nodeid = sn_send_failed->ssf_info.sinfo_ppid; log_print("Retry sending %d bytes to node id %d", len, nodeid); + + if (!nodeid) { + log_print("Shouldn't resend data via listening connection."); + return; + } con = nodeid2con(nodeid, 0); if (!con) { @@ -649,6 +654,7 @@ static void process_sctp_notification(struct connection *con, struct msghdr *msg, char *buf) { union sctp_notification *sn = (union sctp_notification *)buf; + struct linger linger; switch (sn->sn_header.sn_type) { case SCTP_SEND_FAILED: @@ -713,11 +719,11 @@ static void process_sctp_notification(struct connection *con, return; /* Peel off a new sock */ - sctp_lock_sock(con->sock->sk); + lock_sock(con->sock->sk); ret = sctp_do_peeloff(con->sock->sk, sn->sn_assoc_change.sac_assoc_id, &new_con->sock); - sctp_release_sock(con->sock->sk); + release_sock(con->sock->sk); if (ret < 0) { log_print("Can't peel off a socket for " "connection %d to node %d: err=%d", @@ -727,6 +733,13 @@ static void process_sctp_notification(struct connection *con, } add_sock(new_con->sock, new_con); + linger.l_onoff = 1; + linger.l_linger = 0; + ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof(linger)); + if (ret < 0) + log_print("set socket option SO_LINGER failed"); + log_print("connecting to %d sctp association %d", nodeid, (int)sn->sn_assoc_change.sac_assoc_id); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 476557b5492..9c47f1c14a8 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -60,18 +60,15 @@ void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc) #define SLOT_DEBUG_LINE 128 -static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, - struct rcom_slot *ro0, struct dlm_slot *array, - int array_size) +static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, + struct rcom_slot *ro0, struct dlm_slot *array, + int array_size) { char line[SLOT_DEBUG_LINE]; int len = SLOT_DEBUG_LINE - 1; int pos = 0; int ret, i; - if (!dlm_config.ci_log_debug) - return; - memset(line, 0, sizeof(line)); if (array) { @@ -95,7 +92,7 @@ static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, } } - log_debug(ls, "generation %u slots %d%s", gen, num_slots, line); + log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line); } int dlm_slots_copy_in(struct dlm_ls *ls) @@ -129,7 +126,7 @@ int dlm_slots_copy_in(struct dlm_ls *ls) ro->ro_slot = le16_to_cpu(ro->ro_slot); } - log_debug_slots(ls, gen, num_slots, ro0, NULL, 0); + log_slots(ls, gen, num_slots, ro0, NULL, 0); list_for_each_entry(memb, &ls->ls_nodes, list) { for (i = 0, ro = ro0; i < num_slots; i++, ro++) { @@ -274,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, gen++; - log_debug_slots(ls, gen, num, NULL, array, array_size); + log_slots(ls, gen, num, NULL, array, array_size); max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) - sizeof(struct rcom_config)) / sizeof(struct rcom_slot); @@ -447,7 +444,7 @@ static int ping_members(struct dlm_ls *ls) break; } if (error) - log_debug(ls, "ping_members aborted %d last nodeid %d", + log_rinfo(ls, "ping_members aborted %d last nodeid %d", error, ls->ls_recover_nodeid); return error; } @@ -539,7 +536,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) count as a negative change so the "neg" recovery steps will happen */ list_for_each_entry(memb, &ls->ls_nodes_gone, list) { - log_debug(ls, "prev removed member %d", memb->nodeid); + log_rinfo(ls, "prev removed member %d", memb->nodeid); neg++; } @@ -551,10 +548,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) continue; if (!node) { - log_debug(ls, "remove member %d", memb->nodeid); + log_rinfo(ls, "remove member %d", memb->nodeid); } else { /* removed and re-added */ - log_debug(ls, "remove member %d comm_seq %u %u", + log_rinfo(ls, "remove member %d comm_seq %u %u", memb->nodeid, memb->comm_seq, node->comm_seq); } @@ -571,7 +568,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) if (dlm_is_member(ls, node->nodeid)) continue; dlm_add_member(ls, node); - log_debug(ls, "add member %d", node->nodeid); + log_rinfo(ls, "add member %d", node->nodeid); } list_for_each_entry(memb, &ls->ls_nodes, list) { @@ -591,7 +588,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) complete(&ls->ls_members_done); } - log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); + log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 60a327863b1..e7cfbaf8d0e 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -74,14 +74,16 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info) return 0; } -static struct genl_ops dlm_nl_ops = { - .cmd = DLM_CMD_HELLO, - .doit = user_cmd, +static struct genl_ops dlm_nl_ops[] = { + { + .cmd = DLM_CMD_HELLO, + .doit = user_cmd, + }, }; int __init dlm_netlink_init(void) { - return genl_register_family_with_ops(&family, &dlm_nl_ops, 1); + return genl_register_family_with_ops(&family, dlm_nl_ops); } void dlm_netlink_exit(void) diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index a6bc63f6e31..eaea789bf97 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -526,7 +526,7 @@ int dlm_recover_masters(struct dlm_ls *ls) int nodir = dlm_no_directory(ls); int error; - log_debug(ls, "dlm_recover_masters"); + log_rinfo(ls, "dlm_recover_masters"); down_read(&ls->ls_root_sem); list_for_each_entry(r, &ls->ls_root_list, res_root_list) { @@ -552,7 +552,7 @@ int dlm_recover_masters(struct dlm_ls *ls) } up_read(&ls->ls_root_sem); - log_debug(ls, "dlm_recover_masters %u of %u", count, total); + log_rinfo(ls, "dlm_recover_masters %u of %u", count, total); error = dlm_wait_function(ls, &recover_idr_empty); out: @@ -685,7 +685,7 @@ int dlm_recover_locks(struct dlm_ls *ls) } up_read(&ls->ls_root_sem); - log_debug(ls, "dlm_recover_locks %d out", count); + log_rinfo(ls, "dlm_recover_locks %d out", count); error = dlm_wait_function(ls, &recover_list_empty); out: @@ -883,7 +883,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls) up_read(&ls->ls_root_sem); if (count) - log_debug(ls, "dlm_recover_rsbs %d done", count); + log_rinfo(ls, "dlm_recover_rsbs %d done", count); } /* Create a single list of all root rsb's to be used during recovery */ @@ -950,6 +950,6 @@ void dlm_clear_toss(struct dlm_ls *ls) } if (count) - log_debug(ls, "dlm_clear_toss %u done", count); + log_rinfo(ls, "dlm_clear_toss %u done", count); } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 32f9f8926ec..6859b4bf971 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -55,7 +55,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) unsigned long start; int error, neg = 0; - log_debug(ls, "dlm_recover %llu", (unsigned long long)rv->seq); + log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq); mutex_lock(&ls->ls_recoverd_active); @@ -76,7 +76,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_members(ls, rv, &neg); if (error) { - log_debug(ls, "dlm_recover_members error %d", error); + log_rinfo(ls, "dlm_recover_members error %d", error); goto fail; } @@ -90,7 +90,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_members_wait(ls); if (error) { - log_debug(ls, "dlm_recover_members_wait error %d", error); + log_rinfo(ls, "dlm_recover_members_wait error %d", error); goto fail; } @@ -103,7 +103,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_directory(ls); if (error) { - log_debug(ls, "dlm_recover_directory error %d", error); + log_rinfo(ls, "dlm_recover_directory error %d", error); goto fail; } @@ -111,11 +111,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_directory_wait(ls); if (error) { - log_debug(ls, "dlm_recover_directory_wait error %d", error); + log_rinfo(ls, "dlm_recover_directory_wait error %d", error); goto fail; } - log_debug(ls, "dlm_recover_directory %u out %u messages", + log_rinfo(ls, "dlm_recover_directory %u out %u messages", ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg); /* @@ -144,7 +144,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_masters(ls); if (error) { - log_debug(ls, "dlm_recover_masters error %d", error); + log_rinfo(ls, "dlm_recover_masters error %d", error); goto fail; } @@ -154,7 +154,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks(ls); if (error) { - log_debug(ls, "dlm_recover_locks error %d", error); + log_rinfo(ls, "dlm_recover_locks error %d", error); goto fail; } @@ -162,11 +162,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks_wait(ls); if (error) { - log_debug(ls, "dlm_recover_locks_wait error %d", error); + log_rinfo(ls, "dlm_recover_locks_wait error %d", error); goto fail; } - log_debug(ls, "dlm_recover_locks %u in", + log_rinfo(ls, "dlm_recover_locks %u in", ls->ls_recover_locks_in); /* @@ -186,7 +186,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks_wait(ls); if (error) { - log_debug(ls, "dlm_recover_locks_wait error %d", error); + log_rinfo(ls, "dlm_recover_locks_wait error %d", error); goto fail; } } @@ -205,7 +205,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_done_wait(ls); if (error) { - log_debug(ls, "dlm_recover_done_wait error %d", error); + log_rinfo(ls, "dlm_recover_done_wait error %d", error); goto fail; } @@ -217,25 +217,25 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = enable_locking(ls, rv->seq); if (error) { - log_debug(ls, "enable_locking error %d", error); + log_rinfo(ls, "enable_locking error %d", error); goto fail; } error = dlm_process_requestqueue(ls); if (error) { - log_debug(ls, "dlm_process_requestqueue error %d", error); + log_rinfo(ls, "dlm_process_requestqueue error %d", error); goto fail; } error = dlm_recover_waiters_post(ls); if (error) { - log_debug(ls, "dlm_recover_waiters_post error %d", error); + log_rinfo(ls, "dlm_recover_waiters_post error %d", error); goto fail; } dlm_recover_grant(ls); - log_debug(ls, "dlm_recover %llu generation %u done: %u ms", + log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms", (unsigned long long)rv->seq, ls->ls_generation, jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); @@ -245,7 +245,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) fail: dlm_release_root_list(ls); - log_debug(ls, "dlm_recover %llu error %d", + log_rinfo(ls, "dlm_recover %llu error %d", (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); return error; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 9fd702f5bfb..1de7294aad2 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -50,7 +50,7 @@ static void drop_slab(void) } while (nr_objects > 10); } -int drop_caches_sysctl_handler(ctl_table *table, int write, +int drop_caches_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int ret; @@ -59,10 +59,22 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, if (ret) return ret; if (write) { - if (sysctl_drop_caches & 1) + static int stfu; + + if (sysctl_drop_caches & 1) { iterate_supers(drop_pagecache_sb, NULL); - if (sysctl_drop_caches & 2) + count_vm_event(DROP_PAGECACHE); + } + if (sysctl_drop_caches & 2) { drop_slab(); + count_vm_event(DROP_SLAB); + } + if (!stfu) { + pr_info("%s (%d): drop_caches: %d\n", + current->comm, task_pid_nr(current), + sysctl_drop_caches); + } + stfu |= sysctl_drop_caches & 4; } return 0; } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index c88e355f763..2f6735dbf1a 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -392,7 +392,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, wait_for_completion(&ecr->completion); rc = ecr->rc; - INIT_COMPLETION(ecr->completion); + reinit_completion(&ecr->completion); } out: ablkcipher_request_free(req); @@ -408,7 +408,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat, struct page *page) { return ecryptfs_lower_header_size(crypt_stat) + - (page->index << PAGE_CACHE_SHIFT); + ((loff_t)page->index << PAGE_CACHE_SHIFT); } /** diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 992cf95830b..db0fad3269c 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -45,14 +45,13 @@ * The function to be used for directory reads is ecryptfs_read. */ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos) + struct iov_iter *to) { ssize_t rc; struct path *path; struct file *file = iocb->ki_filp; - rc = generic_file_aio_read(iocb, iov, nr_segs, pos); + rc = generic_file_read_iter(iocb, to); /* * Even though this is a async interface, we need to wait * for IO to finish to update atime @@ -271,7 +270,7 @@ static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); - if (lower_file->f_op && lower_file->f_op->flush) { + if (lower_file->f_op->flush) { filemap_write_and_wait(file->f_mapping); return lower_file->f_op->flush(lower_file, td); } @@ -305,7 +304,7 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) struct file *lower_file = NULL; lower_file = ecryptfs_file_to_lower(file); - if (lower_file->f_op && lower_file->f_op->fasync) + if (lower_file->f_op->fasync) rc = lower_file->f_op->fasync(fd, lower_file, flag); return rc; } @@ -313,12 +312,10 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) static long ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct file *lower_file = NULL; + struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOTTY; - if (ecryptfs_file_to_private(file)) - lower_file = ecryptfs_file_to_lower(file); - if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl) + if (lower_file->f_op->unlocked_ioctl) rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); return rc; } @@ -327,12 +324,10 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) static long ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct file *lower_file = NULL; + struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOIOCTLCMD; - if (ecryptfs_file_to_private(file)) - lower_file = ecryptfs_file_to_lower(file); - if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl) + if (lower_file->f_op && lower_file->f_op->compat_ioctl) rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); return rc; } @@ -356,10 +351,10 @@ const struct file_operations ecryptfs_dir_fops = { const struct file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = ecryptfs_read_update_atime, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = ecryptfs_read_update_atime, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 0f9b66eaa76..d4a9431ec73 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -153,7 +153,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, dget(lower_dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_unlink(lower_dir_inode, lower_dentry); + rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; @@ -208,7 +208,7 @@ ecryptfs_do_create(struct inode *directory_inode, inode = __ecryptfs_get_inode(lower_dentry->d_inode, directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(lower_dir_dentry->d_inode, lower_dentry); + vfs_unlink(lower_dir_dentry->d_inode, lower_dentry, NULL); goto out_lock; } fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); @@ -475,7 +475,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, dget(lower_new_dentry); lower_dir_dentry = lock_parent(lower_new_dentry); rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, - lower_new_dentry); + lower_new_dentry, NULL); if (rc || !lower_new_dentry->d_inode) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); @@ -640,7 +640,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_lock; } rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, - lower_new_dir_dentry->d_inode, lower_new_dentry); + lower_new_dir_dentry->d_inode, lower_new_dentry, + NULL, 0); if (rc) goto out_lock; if (target_inode) @@ -658,19 +659,17 @@ out_lock: return rc; } -static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, - size_t *bufsiz) +static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); char *lower_buf; + char *buf; mm_segment_t old_fs; int rc; lower_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!lower_buf) { - rc = -ENOMEM; - goto out; - } + if (!lower_buf) + return ERR_PTR(-ENOMEM); old_fs = get_fs(); set_fs(get_ds()); rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, @@ -679,21 +678,18 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, set_fs(old_fs); if (rc < 0) goto out; - rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb, + rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb, lower_buf, rc); out: kfree(lower_buf); - return rc; + return rc ? ERR_PTR(rc) : buf; } static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - char *buf; - size_t len = PATH_MAX; - int rc; - - rc = ecryptfs_readlink_lower(dentry, &buf, &len); - if (rc) + size_t len; + char *buf = ecryptfs_readlink_lower(dentry, &len); + if (IS_ERR(buf)) goto out; fsstack_copy_attr_atime(dentry->d_inode, ecryptfs_dentry_to_lower(dentry)->d_inode); @@ -881,7 +877,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = notify_change(lower_dentry, &lower_ia); + rc = notify_change(lower_dentry, &lower_ia, NULL); mutex_unlock(&lower_dentry->d_inode->i_mutex); } return rc; @@ -982,7 +978,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) lower_ia.ia_valid &= ~ATTR_MODE; mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = notify_change(lower_dentry, &lower_ia); + rc = notify_change(lower_dentry, &lower_ia, NULL); mutex_unlock(&lower_dentry->d_inode->i_mutex); out: fsstack_copy_attr_all(inode, lower_inode); @@ -1002,10 +998,12 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, char *target; size_t targetsiz; - rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz); - if (!rc) { + target = ecryptfs_readlink_lower(dentry, &targetsiz); + if (!IS_ERR(target)) { kfree(target); stat->size = targetsiz; + } else { + rc = PTR_ERR(target); } } return rc; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 7d52806c211..4725a07f003 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1149,7 +1149,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_msg_ctx *msg_ctx; struct ecryptfs_message *msg = NULL; char *auth_tok_sig; - char *payload; + char *payload = NULL; size_t payload_len = 0; int rc; @@ -1203,6 +1203,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, } out: kfree(msg); + kfree(payload); return rc; } diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index e879cf8ff0b..afa1b81c341 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -132,7 +132,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ static void ecryptfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); iput(ecryptfs_inode_to_lower(inode)); } diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 8dd524f3228..cdb2971192a 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -21,7 +21,7 @@ static ssize_t efivarfs_file_write(struct file *file, u32 attributes; struct inode *inode = file->f_mapping->host; unsigned long datasize = count - sizeof(attributes); - ssize_t bytes = 0; + ssize_t bytes; bool set = false; if (count < sizeof(attributes)) @@ -33,14 +33,9 @@ static ssize_t efivarfs_file_write(struct file *file, if (attributes & ~(EFI_VARIABLE_MASK)) return -EINVAL; - data = kmalloc(datasize, GFP_KERNEL); - if (!data) - return -ENOMEM; - - if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) { - bytes = -EFAULT; - goto out; - } + data = memdup_user(userbuf + sizeof(attributes), datasize); + if (IS_ERR(data)) + return PTR_ERR(data); bytes = efivar_entry_set_get_size(var, attributes, &datasize, data, &set); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index a8766b880c0..0a48886e069 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -83,19 +83,10 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) return 0; } -/* - * Retaining negative dentries for an in-memory filesystem just wastes - * memory and lookup time: arrange for them to be deleted immediately. - */ -static int efivarfs_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - -static struct dentry_operations efivarfs_d_ops = { +static const struct dentry_operations efivarfs_d_ops = { .d_compare = efivarfs_d_compare, .d_hash = efivarfs_d_hash, - .d_delete = efivarfs_delete_dentry, + .d_delete = always_delete_dentry, }; static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) diff --git a/fs/efs/dir.c b/fs/efs/dir.c index b72307ccdf7..ce63b24f7c3 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -26,7 +26,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) int slot; if (inode->i_size & (EFS_DIRBSIZE-1)) - printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); + pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n", + __func__); /* work out where this entry can be found */ block = ctx->pos >> EFS_DIRBSIZE_BITS; @@ -43,14 +44,15 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); if (!bh) { - printk(KERN_ERR "EFS: readdir(): failed to read dir block %d\n", block); + pr_err("%s(): failed to read dir block %d\n", + __func__, block); break; } dirblock = (struct efs_dir *) bh->b_data; if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { - printk(KERN_ERR "EFS: readdir(): invalid directory block\n"); + pr_err("%s(): invalid directory block\n", __func__); brelse(bh); break; } @@ -69,10 +71,9 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) inodenum = be32_to_cpu(dirslot->inode); namelen = dirslot->namelen; nameptr = dirslot->name; - -#ifdef DEBUG - printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); -#endif + pr_debug("%s(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", + __func__, block, slot, dirblock->slots-1, + inodenum, nameptr, namelen); if (!namelen) continue; /* found the next entry */ @@ -80,7 +81,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) /* sanity check */ if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { - printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); + pr_warn("directory entry %d exceeds directory block\n", + slot); continue; } diff --git a/fs/efs/efs.h b/fs/efs/efs.h index 5528926ac7f..5bbf9612140 100644 --- a/fs/efs/efs.h +++ b/fs/efs/efs.h @@ -7,6 +7,12 @@ #ifndef _EFS_EFS_H_ #define _EFS_EFS_H_ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <asm/uaccess.h> diff --git a/fs/efs/file.c b/fs/efs/file.c index 1ccb364ffa6..a37dcee4686 100644 --- a/fs/efs/file.c +++ b/fs/efs/file.c @@ -22,10 +22,8 @@ int efs_get_block(struct inode *inode, sector_t iblock, /* * i have no idea why this happens as often as it does */ - printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", - block, - inode->i_blocks, - inode->i_size); + pr_warn("%s(): block %d >= %ld (filesize %ld)\n", + __func__, block, inode->i_blocks, inode->i_size); #endif return 0; } @@ -38,7 +36,7 @@ int efs_get_block(struct inode *inode, sector_t iblock, int efs_bmap(struct inode *inode, efs_block_t block) { if (block < 0) { - printk(KERN_WARNING "EFS: bmap(): block < 0\n"); + pr_warn("%s(): block < 0\n", __func__); return 0; } @@ -48,10 +46,8 @@ int efs_bmap(struct inode *inode, efs_block_t block) { /* * i have no idea why this happens as often as it does */ - printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", - block, - inode->i_blocks, - inode->i_size); + pr_warn("%s(): block %d >= %ld (filesize %ld)\n", + __func__, block, inode->i_blocks, inode->i_size); #endif return 0; } diff --git a/fs/efs/inode.c b/fs/efs/inode.c index d15ccf20f1b..079d20306ee 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -89,7 +89,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) bh = sb_bread(inode->i_sb, block); if (!bh) { - printk(KERN_WARNING "EFS: bread() failed at block %d\n", block); + pr_warn("%s() failed at block %d\n", __func__, block); goto read_inode_error; } @@ -130,19 +130,16 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) for(i = 0; i < EFS_DIRECTEXTENTS; i++) { extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { - printk(KERN_WARNING "EFS: extent %d has bad magic number in inode %lu\n", i, inode->i_ino); + pr_warn("extent %d has bad magic number in inode %lu\n", + i, inode->i_ino); brelse(bh); goto read_inode_error; } } brelse(bh); - -#ifdef DEBUG - printk(KERN_DEBUG "EFS: efs_iget(): inode %lu, extents %d, mode %o\n", - inode->i_ino, in->numextents, inode->i_mode); -#endif - + pr_debug("efs_iget(): inode %lu, extents %d, mode %o\n", + inode->i_ino, in->numextents, inode->i_mode); switch (inode->i_mode & S_IFMT) { case S_IFDIR: inode->i_op = &efs_dir_inode_operations; @@ -162,7 +159,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) init_special_inode(inode, inode->i_mode, device); break; default: - printk(KERN_WARNING "EFS: unsupported inode mode %o\n", inode->i_mode); + pr_warn("unsupported inode mode %o\n", inode->i_mode); goto read_inode_error; break; } @@ -171,7 +168,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) return inode; read_inode_error: - printk(KERN_WARNING "EFS: failed to read inode %lu\n", inode->i_ino); + pr_warn("failed to read inode %lu\n", inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); } @@ -216,7 +213,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { /* if we only have one extent then nothing can be found */ if (in->numextents == 1) { - printk(KERN_ERR "EFS: map_block() failed to map (1 extent)\n"); + pr_err("%s() failed to map (1 extent)\n", __func__); return 0; } @@ -234,13 +231,12 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { } } - printk(KERN_ERR "EFS: map_block() failed to map block %u (dir)\n", block); + pr_err("%s() failed to map block %u (dir)\n", __func__, block); return 0; } -#ifdef DEBUG - printk(KERN_DEBUG "EFS: map_block(): indirect search for logical block %u\n", block); -#endif + pr_debug("%s(): indirect search for logical block %u\n", + __func__, block); direxts = in->extents[0].cooked.ex_offset; indexts = in->numextents; @@ -262,7 +258,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { if (dirext == direxts) { /* should never happen */ - printk(KERN_ERR "EFS: couldn't find direct extent for indirect extent %d (block %u)\n", cur, block); + pr_err("couldn't find direct extent for indirect extent %d (block %u)\n", + cur, block); if (bh) brelse(bh); return 0; } @@ -279,12 +276,12 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { bh = sb_bread(inode->i_sb, iblock); if (!bh) { - printk(KERN_ERR "EFS: bread() failed at block %d\n", iblock); + pr_err("%s() failed at block %d\n", + __func__, iblock); return 0; } -#ifdef DEBUG - printk(KERN_DEBUG "EFS: map_block(): read indirect extent block %d\n", iblock); -#endif + pr_debug("%s(): read indirect extent block %d\n", + __func__, iblock); first = 0; lastblock = iblock; } @@ -294,7 +291,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { extent_copy(&(exts[ioffset]), &ext); if (ext.cooked.ex_magic != 0) { - printk(KERN_ERR "EFS: extent %d has bad magic number in block %d\n", cur, iblock); + pr_err("extent %d has bad magic number in block %d\n", + cur, iblock); if (bh) brelse(bh); return 0; } @@ -306,7 +304,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { } } if (bh) brelse(bh); - printk(KERN_ERR "EFS: map_block() failed to map block %u (indir)\n", block); + pr_err("%s() failed to map block %u (indir)\n", __func__, block); return 0; } diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 96f66d213a1..356c044e2cd 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -23,20 +23,22 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) efs_block_t block; if (inode->i_size & (EFS_DIRBSIZE-1)) - printk(KERN_WARNING "EFS: WARNING: find_entry(): directory size not a multiple of EFS_DIRBSIZE\n"); + pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n", + __func__); for(block = 0; block < inode->i_blocks; block++) { bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); if (!bh) { - printk(KERN_ERR "EFS: find_entry(): failed to read dir block %d\n", block); + pr_err("%s(): failed to read dir block %d\n", + __func__, block); return 0; } dirblock = (struct efs_dir *) bh->b_data; if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { - printk(KERN_ERR "EFS: find_entry(): invalid directory block\n"); + pr_err("%s(): invalid directory block\n", __func__); brelse(bh); return(0); } diff --git a/fs/efs/super.c b/fs/efs/super.c index c6f57a74a55..7fca462ea4e 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -26,11 +26,18 @@ static struct dentry *efs_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); } +static void efs_kill_sb(struct super_block *s) +{ + struct efs_sb_info *sbi = SUPER_INFO(s); + kill_block_super(s); + kfree(sbi); +} + static struct file_system_type efs_fs_type = { .owner = THIS_MODULE, .name = "efs", .mount = efs_mount, - .kill_sb = kill_block_super, + .kill_sb = efs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("efs"); @@ -84,7 +91,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", sizeof(struct efs_inode_info), @@ -105,14 +112,9 @@ static void destroy_inodecache(void) kmem_cache_destroy(efs_inode_cachep); } -static void efs_put_super(struct super_block *s) -{ - kfree(s->s_fs_info); - s->s_fs_info = NULL; -} - static int efs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } @@ -120,7 +122,6 @@ static int efs_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .destroy_inode = efs_destroy_inode, - .put_super = efs_put_super, .statfs = efs_statfs, .remount_fs = efs_remount, }; @@ -133,7 +134,7 @@ static const struct export_operations efs_export_ops = { static int __init init_efs_fs(void) { int err; - printk("EFS: "EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); + pr_info(EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); err = init_inodecache(); if (err) goto out1; @@ -178,12 +179,12 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { csum += be32_to_cpu(cs); } if (csum) { - printk(KERN_INFO "EFS: SGI disklabel: checksum bad, label corrupted\n"); + pr_warn("SGI disklabel: checksum bad, label corrupted\n"); return 0; } #ifdef DEBUG - printk(KERN_DEBUG "EFS: bf: \"%16s\"\n", vh->vh_bootfile); + pr_debug("bf: \"%16s\"\n", vh->vh_bootfile); for(i = 0; i < NVDIR; i++) { int j; @@ -195,9 +196,8 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { name[j] = (char) 0; if (name[0]) { - printk(KERN_DEBUG "EFS: vh: %8s block: 0x%08x size: 0x%08x\n", - name, - (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), + pr_debug("vh: %8s block: 0x%08x size: 0x%08x\n", + name, (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); } } @@ -210,12 +210,11 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { } #ifdef DEBUG if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { - printk(KERN_DEBUG "EFS: pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", - i, - (int) be32_to_cpu(vh->vh_pt[i].pt_firstlbn), - (int) be32_to_cpu(vh->vh_pt[i].pt_nblks), - pt_type, - (pt_entry->pt_name) ? pt_entry->pt_name : "unknown"); + pr_debug("pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", + i, (int)be32_to_cpu(vh->vh_pt[i].pt_firstlbn), + (int)be32_to_cpu(vh->vh_pt[i].pt_nblks), + pt_type, (pt_entry->pt_name) ? + pt_entry->pt_name : "unknown"); } #endif if (IS_EFS(pt_type)) { @@ -225,11 +224,10 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { } if (slice == -1) { - printk(KERN_NOTICE "EFS: partition table contained no EFS partitions\n"); + pr_notice("partition table contained no EFS partitions\n"); #ifdef DEBUG } else { - printk(KERN_INFO "EFS: using slice %d (type %s, offset 0x%x)\n", - slice, + pr_info("using slice %d (type %s, offset 0x%x)\n", slice, (pt_entry->pt_name) ? pt_entry->pt_name : "unknown", sblock); #endif @@ -259,7 +257,6 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) struct efs_sb_info *sb; struct buffer_head *bh; struct inode *root; - int ret = -EINVAL; sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) @@ -268,17 +265,17 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { - printk(KERN_ERR "EFS: device does not support %d byte blocks\n", + pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); - goto out_no_fs_ul; + return -EINVAL; } /* read the vh (volume header) block */ bh = sb_bread(s, 0); if (!bh) { - printk(KERN_ERR "EFS: cannot read volume header\n"); - goto out_no_fs_ul; + pr_err("cannot read volume header\n"); + return -EINVAL; } /* @@ -290,27 +287,28 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) brelse(bh); if (sb->fs_start == -1) { - goto out_no_fs_ul; + return -EINVAL; } bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { - printk(KERN_ERR "EFS: cannot read superblock\n"); - goto out_no_fs_ul; + pr_err("cannot read superblock\n"); + return -EINVAL; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { #ifdef DEBUG - printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); + pr_warn("invalid superblock at block %u\n", + sb->fs_start + EFS_SUPER); #endif brelse(bh); - goto out_no_fs_ul; + return -EINVAL; } brelse(bh); if (!(s->s_flags & MS_RDONLY)) { #ifdef DEBUG - printk(KERN_INFO "EFS: forcing read-only mode\n"); + pr_info("forcing read-only mode\n"); #endif s->s_flags |= MS_RDONLY; } @@ -318,25 +316,17 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) s->s_export_op = &efs_export_ops; root = efs_iget(s, EFS_ROOTINODE); if (IS_ERR(root)) { - printk(KERN_ERR "EFS: get root inode failed\n"); - ret = PTR_ERR(root); - goto out_no_fs; + pr_err("get root inode failed\n"); + return PTR_ERR(root); } s->s_root = d_make_root(root); if (!(s->s_root)) { - printk(KERN_ERR "EFS: get root dentry failed\n"); - ret = -ENOMEM; - goto out_no_fs; + pr_err("get root dentry failed\n"); + return -ENOMEM; } return 0; - -out_no_fs_ul: -out_no_fs: - s->s_fs_info = NULL; - kfree(sb); - return ret; } static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { diff --git a/fs/eventfd.c b/fs/eventfd.c index 35470d9b96e..d6a88e7812f 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -349,15 +349,12 @@ EXPORT_SYMBOL_GPL(eventfd_fget); */ struct eventfd_ctx *eventfd_ctx_fdget(int fd) { - struct file *file; struct eventfd_ctx *ctx; - - file = eventfd_fget(fd); - if (IS_ERR(file)) - return (struct eventfd_ctx *) file; - ctx = eventfd_ctx_get(file->private_data); - fput(file); - + struct fd f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + ctx = eventfd_ctx_fileget(f.file); + fdput(f); return ctx; } EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 473e09da7d0..b10b48c2a7a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -34,7 +34,6 @@ #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> -#include <linux/freezer.h> #include <asm/uaccess.h> #include <asm/io.h> #include <asm/mman.h> @@ -42,6 +41,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/compat.h> +#include <linux/rculist.h> /* * LOCKING: @@ -134,8 +134,12 @@ struct nested_calls { * of these on a server and we do not want this to take another cache line. */ struct epitem { - /* RB tree node used to link this structure to the eventpoll RB tree */ - struct rb_node rbn; + union { + /* RB tree node links this structure to the eventpoll RB tree */ + struct rb_node rbn; + /* Used to free the struct epitem */ + struct rcu_head rcu; + }; /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; @@ -289,7 +293,7 @@ static LIST_HEAD(tfile_check_list); static long zero; static long long_max = LONG_MAX; -ctl_table epoll_table[] = { +struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, @@ -581,14 +585,14 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) * @sproc: Pointer to the scan callback. * @priv: Private opaque data passed to the @sproc callback. * @depth: The current depth of recursive f_op->poll calls. + * @ep_locked: caller already holds ep->mtx * * Returns: The same integer error code returned by the @sproc callback. */ static int ep_scan_ready_list(struct eventpoll *ep, int (*sproc)(struct eventpoll *, struct list_head *, void *), - void *priv, - int depth) + void *priv, int depth, bool ep_locked) { int error, pwake = 0; unsigned long flags; @@ -599,7 +603,9 @@ static int ep_scan_ready_list(struct eventpoll *ep, * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ - mutex_lock_nested(&ep->mtx, depth); + + if (!ep_locked) + mutex_lock_nested(&ep->mtx, depth); /* * Steal the ready list, and re-init the original one to the @@ -663,7 +669,8 @@ static int ep_scan_ready_list(struct eventpoll *ep, } spin_unlock_irqrestore(&ep->lock, flags); - mutex_unlock(&ep->mtx); + if (!ep_locked) + mutex_unlock(&ep->mtx); /* We have to call this outside the lock */ if (pwake) @@ -672,6 +679,12 @@ static int ep_scan_ready_list(struct eventpoll *ep, return error; } +static void epi_rcu_free(struct rcu_head *head) +{ + struct epitem *epi = container_of(head, struct epitem, rcu); + kmem_cache_free(epi_cache, epi); +} + /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. @@ -693,8 +706,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); - if (ep_is_linked(&epi->fllink)) - list_del_init(&epi->fllink); + list_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); rb_erase(&epi->rbn, &ep->rbr); @@ -705,9 +717,14 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) spin_unlock_irqrestore(&ep->lock, flags); wakeup_source_unregister(ep_wakeup_source(epi)); - - /* At this point it is safe to free the eventpoll item */ - kmem_cache_free(epi_cache, epi); + /* + * At this point it is safe to free the eventpoll item. Use the union + * field epi->rcu, since we are trying to minimize the size of + * 'struct epitem'. The 'rbn' field is no longer in use. Protected by + * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make + * use of the rbn field. + */ + call_rcu(&epi->rcu, epi_rcu_free); atomic_long_dec(&ep->user->epoll_watches); @@ -808,15 +825,34 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, return 0; } +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt); + +struct readyevents_arg { + struct eventpoll *ep; + bool locked; +}; + static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) { - return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1); + struct readyevents_arg *arg = priv; + + return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL, + call_nests + 1, arg->locked); } static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { int pollflags; struct eventpoll *ep = file->private_data; + struct readyevents_arg arg; + + /* + * During ep_insert() we already hold the ep->mtx for the tfile. + * Prevent re-aquisition. + */ + arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc); + arg.ep = ep; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); @@ -828,7 +864,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) * could re-enter here. */ pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, - ep_poll_readyevents_proc, ep, ep, current); + ep_poll_readyevents_proc, &arg, ep, current); return pollflags != -1 ? pollflags : 0; } @@ -873,9 +909,8 @@ static const struct file_operations eventpoll_fops = { */ void eventpoll_release_file(struct file *file) { - struct list_head *lsthead = &file->f_ep_links; struct eventpoll *ep; - struct epitem *epi; + struct epitem *epi, *next; /* * We don't want to get "file->f_lock" because it is not @@ -891,17 +926,12 @@ void eventpoll_release_file(struct file *file) * Besides, ep_remove() acquires the lock, so we can't hold it here. */ mutex_lock(&epmutex); - - while (!list_empty(lsthead)) { - epi = list_first_entry(lsthead, struct epitem, fllink); - + list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) { ep = epi->ep; - list_del_init(&epi->fllink); mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); mutex_unlock(&ep->mtx); } - mutex_unlock(&epmutex); } @@ -1139,7 +1169,9 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) struct file *child_file; struct epitem *epi; - list_for_each_entry(epi, &file->f_ep_links, fllink) { + /* CTL_DEL can remove links here, but that can't increase our count */ + rcu_read_lock(); + list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { child_file = epi->ep->file; if (is_file_epoll(child_file)) { if (list_empty(&child_file->f_ep_links)) { @@ -1161,6 +1193,7 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) "file is not an ep!\n"); } } + rcu_read_unlock(); return error; } @@ -1232,7 +1265,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi) * Must be called with "mtx" held. */ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, - struct file *tfile, int fd) + struct file *tfile, int fd, int full_check) { int error, revents, pwake = 0; unsigned long flags; @@ -1287,7 +1320,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* Add the current item to the list of active epoll hook for this file */ spin_lock(&tfile->f_lock); - list_add_tail(&epi->fllink, &tfile->f_ep_links); + list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); spin_unlock(&tfile->f_lock); /* @@ -1298,7 +1331,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* now check if we've created too many backpaths */ error = -EINVAL; - if (reverse_path_check()) + if (full_check && reverse_path_check()) goto error_remove_epi; /* We have to drop the new item inside our item list to keep track of it */ @@ -1328,8 +1361,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, error_remove_epi: spin_lock(&tfile->f_lock); - if (ep_is_linked(&epi->fllink)) - list_del_init(&epi->fllink); + list_del_rcu(&epi->fllink); spin_unlock(&tfile->f_lock); rb_erase(&epi->rbn, &ep->rbr); @@ -1522,7 +1554,7 @@ static int ep_send_events(struct eventpoll *ep, esed.maxevents = maxevents; esed.events = events; - return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0); + return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); } static inline struct timespec ep_set_mstimeout(long ms) @@ -1605,8 +1637,7 @@ fetch_events: } spin_unlock_irqrestore(&ep->lock, flags); - if (!freezable_schedule_hrtimeout_range(to, slack, - HRTIMER_MODE_ABS)) + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) timed_out = 1; spin_lock_irqsave(&ep->lock, flags); @@ -1793,11 +1824,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { int error; - int did_lock_epmutex = 0; + int full_check = 0; struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct epoll_event epds; + struct eventpoll *tep = NULL; error = -EFAULT; if (ep_op_has_event(op) && @@ -1816,12 +1848,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* The target file descriptor must support poll */ error = -EPERM; - if (!tf.file->f_op || !tf.file->f_op->poll) + if (!tf.file->f_op->poll) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ - if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) - epds.events &= ~EPOLLWAKEUP; + ep_take_care_of_epollwakeup(&epds); /* * We have to check that the file structure underneath the file descriptor @@ -1846,27 +1877,37 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * and hang them on the tfile_check_list, so we can check that we * haven't created too many possible wakeup paths. * - * We need to hold the epmutex across both ep_insert and ep_remove - * b/c we want to make sure we are looking at a coherent view of - * epoll network. + * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when + * the epoll file descriptor is attaching directly to a wakeup source, + * unless the epoll file descriptor is nested. The purpose of taking the + * 'epmutex' on add is to prevent complex toplogies such as loops and + * deep wakeup paths from forming in parallel through multiple + * EPOLL_CTL_ADD operations. */ - if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { - mutex_lock(&epmutex); - did_lock_epmutex = 1; - } + mutex_lock_nested(&ep->mtx, 0); if (op == EPOLL_CTL_ADD) { - if (is_file_epoll(tf.file)) { - error = -ELOOP; - if (ep_loop_check(ep, tf.file) != 0) { - clear_tfile_check_list(); - goto error_tgt_fput; + if (!list_empty(&f.file->f_ep_links) || + is_file_epoll(tf.file)) { + full_check = 1; + mutex_unlock(&ep->mtx); + mutex_lock(&epmutex); + if (is_file_epoll(tf.file)) { + error = -ELOOP; + if (ep_loop_check(ep, tf.file) != 0) { + clear_tfile_check_list(); + goto error_tgt_fput; + } + } else + list_add(&tf.file->f_tfile_llink, + &tfile_check_list); + mutex_lock_nested(&ep->mtx, 0); + if (is_file_epoll(tf.file)) { + tep = tf.file->private_data; + mutex_lock_nested(&tep->mtx, 1); } - } else - list_add(&tf.file->f_tfile_llink, &tfile_check_list); + } } - mutex_lock_nested(&ep->mtx, 0); - /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by @@ -1879,10 +1920,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; - error = ep_insert(ep, &epds, tf.file, fd); + error = ep_insert(ep, &epds, tf.file, fd, full_check); } else error = -EEXIST; - clear_tfile_check_list(); + if (full_check) + clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -1898,10 +1940,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, error = -ENOENT; break; } + if (tep != NULL) + mutex_unlock(&tep->mtx); mutex_unlock(&ep->mtx); error_tgt_fput: - if (did_lock_epmutex) + if (full_check) mutex_unlock(&epmutex); fdput(tf); diff --git a/fs/exec.c b/fs/exec.c index 8875dd10ae7..a3d33fe592d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -26,6 +26,7 @@ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> @@ -62,7 +63,6 @@ #include <trace/events/task.h> #include "internal.h" -#include "coredump.h" #include <trace/events/sched.h> @@ -98,6 +98,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt) module_put(fmt->module); } +#ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to * security reasons. @@ -106,6 +107,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt) */ SYSCALL_DEFINE1(uselib, const char __user *, library) { + struct linux_binfmt *fmt; struct file *file; struct filename *tmp = getname(library); int error = PTR_ERR(tmp); @@ -136,29 +138,27 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) fsnotify_open(file); error = -ENOEXEC; - if(file->f_op) { - struct linux_binfmt * fmt; - read_lock(&binfmt_lock); - list_for_each_entry(fmt, &formats, lh) { - if (!fmt->load_shlib) - continue; - if (!try_module_get(fmt->module)) - continue; - read_unlock(&binfmt_lock); - error = fmt->load_shlib(file); - read_lock(&binfmt_lock); - put_binfmt(fmt); - if (error != -ENOEXEC) - break; - } + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!fmt->load_shlib) + continue; + if (!try_module_get(fmt->module)) + continue; read_unlock(&binfmt_lock); + error = fmt->load_shlib(file); + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (error != -ENOEXEC) + break; } + read_unlock(&binfmt_lock); exit: fput(file); out: return error; } +#endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_MMU /* @@ -657,10 +657,10 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP - /* Limit stack size to 1GB */ + /* Limit stack size */ stack_base = rlimit_max(RLIMIT_STACK); - if (stack_base > (1 << 30)) - stack_base = 1 << 30; + if (stack_base > STACK_SIZE_MAX) + stack_base = STACK_SIZE_MAX; /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) @@ -751,11 +751,10 @@ EXPORT_SYMBOL(setup_arg_pages); #endif /* CONFIG_MMU */ -struct file *open_exec(const char *name) +static struct file *do_open_exec(struct filename *name) { struct file *file; int err; - struct filename tmp = { .name = name }; static const struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC | MAY_OPEN, @@ -763,7 +762,7 @@ struct file *open_exec(const char *name) .lookup_flags = LOOKUP_FOLLOW, }; - file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags); + file = do_filp_open(AT_FDCWD, name, &open_exec_flags); if (IS_ERR(file)) goto out; @@ -787,6 +786,12 @@ exit: fput(file); return ERR_PTR(err); } + +struct file *open_exec(const char *name) +{ + struct filename tmp = { .name = name }; + return do_open_exec(&tmp); +} EXPORT_SYMBOL(open_exec); int kernel_read(struct file *file, loff_t offset, @@ -808,7 +813,7 @@ EXPORT_SYMBOL(kernel_read); ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { - ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos); + ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_range(addr, addr + len); return res; @@ -818,7 +823,7 @@ EXPORT_SYMBOL(read_code); static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; - struct mm_struct * old_mm, *active_mm; + struct mm_struct *old_mm, *active_mm; /* Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -844,8 +849,9 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + tsk->mm->vmacache_seqnum = 0; + vmacache_flush(tsk); task_unlock(tsk); - arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); @@ -1040,28 +1046,13 @@ EXPORT_SYMBOL_GPL(get_task_comm); * so that a new one can be started */ -void set_task_comm(struct task_struct *tsk, char *buf) +void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); - perf_event_comm(tsk); -} - -static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len) -{ - int i, ch; - - /* Copies the binary name from after last slash */ - for (i = 0; (ch = *(fn++)) != '\0';) { - if (ch == '/') - i = 0; /* overwrite what we wrote */ - else - if (i < len - 1) - tcomm[i++] = ch; - } - tcomm[i] = '\0'; + perf_event_comm(tsk, exec); } int flush_old_exec(struct linux_binprm * bprm) @@ -1077,8 +1068,6 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; set_mm_exe_file(bprm->mm, bprm->file); - - filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm)); /* * Release all of the old mmap stuff */ @@ -1090,8 +1079,8 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ set_fs(USER_DS); - current->flags &= - ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE); + current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); current->personality &= ~bprm->per_clear; @@ -1121,7 +1110,8 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); - set_task_comm(current, bprm->tcomm); + perf_event_exec(); + __set_task_comm(current, kbasename(bprm->filename), true); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on @@ -1141,9 +1131,7 @@ void setup_new_exec(struct linux_binprm * bprm) /* An exec changes our domain. We are no longer part of the thread group */ - current->self_exec_id++; - flush_signal_handlers(current, 0); do_close_on_exec(current->files); } @@ -1168,13 +1156,17 @@ int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } -void free_bprm(struct linux_binprm *bprm) +static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + } /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); @@ -1226,11 +1218,10 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH */ -static int check_unsafe_exec(struct linux_binprm *bprm) +static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; - int res = 0; if (p->ptrace) { if (p->ptrace & PT_PTRACE_CAP) @@ -1246,44 +1237,35 @@ static int check_unsafe_exec(struct linux_binprm *bprm) if (current->no_new_privs) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); - for (t = next_thread(p); t != p; t = next_thread(t)) { + while_each_thread(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); - if (p->fs->users > n_fs) { + if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; - } else { - res = -EAGAIN; - if (!p->fs->in_exec) { - p->fs->in_exec = 1; - res = 1; - } - } + else + p->fs->in_exec = 1; spin_unlock(&p->fs->lock); - - return res; } -/* - * Fill the binprm structure from the inode. +/* + * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes * * This may be called multiple times for binary chains (scripts for example). */ int prepare_binprm(struct linux_binprm *bprm) { - umode_t mode; - struct inode * inode = file_inode(bprm->file); + struct inode *inode = file_inode(bprm->file); + umode_t mode = inode->i_mode; int retval; - mode = inode->i_mode; - if (bprm->file->f_op == NULL) - return -EACCES; /* clear any previous set[ug]id data from a previous binary */ bprm->cred->euid = current_euid(); @@ -1385,10 +1367,6 @@ int search_binary_handler(struct linux_binprm *bprm) if (retval) return retval; - retval = audit_bprm(bprm); - if (retval) - return retval; - retval = -ENOENT; retry: read_lock(&binfmt_lock); @@ -1436,16 +1414,10 @@ static int exec_binprm(struct linux_binprm *bprm) ret = search_binary_handler(bprm); if (ret >= 0) { + audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - current->did_exec = 1; proc_exec_connector(current); - - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; /* to catch use-after-free */ - } } return ret; @@ -1454,16 +1426,18 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execve_common(const char *filename, +static int do_execve_common(struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp) { struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; - bool clear_in_exec; int retval; + if (IS_ERR(filename)) + return PTR_ERR(filename); + /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs @@ -1493,13 +1467,10 @@ static int do_execve_common(const char *filename, if (retval) goto out_free; - retval = check_unsafe_exec(bprm); - if (retval < 0) - goto out_free; - clear_in_exec = retval; + check_unsafe_exec(bprm); current->in_execve = 1; - file = open_exec(filename); + file = do_open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1507,12 +1478,11 @@ static int do_execve_common(const char *filename, sched_exec(); bprm->file = file; - bprm->filename = filename; - bprm->interp = filename; + bprm->filename = bprm->interp = filename->name; retval = bprm_mm_init(bprm); if (retval) - goto out_file; + goto out_unmark; bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) @@ -1547,7 +1517,9 @@ static int do_execve_common(const char *filename, current->fs->in_exec = 0; current->in_execve = 0; acct_update_integrals(current); + task_numa_free(current); free_bprm(bprm); + putname(filename); if (displaced) put_files_struct(displaced); return retval; @@ -1558,15 +1530,8 @@ out: mmput(bprm->mm); } -out_file: - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } - out_unmark: - if (clear_in_exec) - current->fs->in_exec = 0; + current->fs->in_exec = 0; current->in_execve = 0; out_free: @@ -1576,10 +1541,11 @@ out_files: if (displaced) reset_files_struct(displaced); out_ret: + putname(filename); return retval; } -int do_execve(const char *filename, +int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { @@ -1589,7 +1555,7 @@ int do_execve(const char *filename, } #ifdef CONFIG_COMPAT -static int compat_do_execve(const char *filename, +static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { @@ -1616,61 +1582,22 @@ void set_binfmt(struct linux_binfmt *new) if (new) __module_get(new->module); } - EXPORT_SYMBOL(set_binfmt); /* - * set_dumpable converts traditional three-value dumpable to two flags and - * stores them into mm->flags. It modifies lower two bits of mm->flags, but - * these bits are not changed atomically. So get_dumpable can observe the - * intermediate state. To avoid doing unexpected behavior, get get_dumpable - * return either old dumpable or new one by paying attention to the order of - * modifying the bits. - * - * dumpable | mm->flags (binary) - * old new | initial interim final - * ---------+----------------------- - * 0 1 | 00 01 01 - * 0 2 | 00 10(*) 11 - * 1 0 | 01 00 00 - * 1 2 | 01 11 11 - * 2 0 | 11 10(*) 00 - * 2 1 | 11 11 01 - * - * (*) get_dumpable regards interim value of 10 as 11. + * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { - switch (value) { - case SUID_DUMP_DISABLE: - clear_bit(MMF_DUMPABLE, &mm->flags); - smp_wmb(); - clear_bit(MMF_DUMP_SECURELY, &mm->flags); - break; - case SUID_DUMP_USER: - set_bit(MMF_DUMPABLE, &mm->flags); - smp_wmb(); - clear_bit(MMF_DUMP_SECURELY, &mm->flags); - break; - case SUID_DUMP_ROOT: - set_bit(MMF_DUMP_SECURELY, &mm->flags); - smp_wmb(); - set_bit(MMF_DUMPABLE, &mm->flags); - break; - } -} + unsigned long old, new; -int __get_dumpable(unsigned long mm_flags) -{ - int ret; - - ret = mm_flags & MMF_DUMPABLE_MASK; - return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret; -} + if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) + return; -int get_dumpable(struct mm_struct *mm) -{ - return __get_dumpable(mm->flags); + do { + old = ACCESS_ONCE(mm->flags); + new = (old & ~MMF_DUMPABLE_MASK) | value; + } while (cmpxchg(&mm->flags, old, new) != old); } SYSCALL_DEFINE3(execve, @@ -1678,25 +1605,13 @@ SYSCALL_DEFINE3(execve, const char __user *const __user *, argv, const char __user *const __user *, envp) { - struct filename *path = getname(filename); - int error = PTR_ERR(path); - if (!IS_ERR(path)) { - error = do_execve(path->name, argv, envp); - putname(path); - } - return error; + return do_execve(getname(filename), argv, envp); } #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_execve(const char __user * filename, - const compat_uptr_t __user * argv, - const compat_uptr_t __user * envp) +COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, + const compat_uptr_t __user *, argv, + const compat_uptr_t __user *, envp) { - struct filename *path = getname(filename); - int error = PTR_ERR(path); - if (!IS_ERR(path)) { - error = compat_do_execve(path->name, argv, envp); - putname(path); - } - return error; + return compat_do_execve(getname(filename), argv, envp); } #endif diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore index 1ca7fb7b6ba..2daf2329c28 100644 --- a/fs/exofs/Kconfig.ore +++ b/fs/exofs/Kconfig.ore @@ -9,4 +9,6 @@ config ORE tristate depends on EXOFS_FS || PNFS_OBJLAYOUT select ASYNC_XOR + select RAID6_PQ + select ASYNC_PQ default SCSI_OSD_ULD diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 491c6c078e7..71bf8e4fb5d 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -67,17 +67,17 @@ static int exofs_flush(struct file *file, fl_owner_t id) const struct file_operations exofs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = generic_file_open, .release = exofs_release_file, .fsync = exofs_file_fsync, .flush = exofs_flush, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; const struct inode_operations exofs_file_inode_operations = { diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index a52a5d23c30..3f9cafd7393 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -577,7 +577,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) if (offset >= i_size) { *uptodate = true; - EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index); + EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index); return ZERO_PAGE(0); } @@ -596,10 +596,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) *uptodate = true; else *uptodate = PageUptodate(page); - EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); + EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); return page; } else { - EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", + EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n", pcol->that_locked_page->index); *uptodate = true; return pcol->that_locked_page; @@ -611,11 +611,11 @@ static void __r4w_put_page(void *priv, struct page *page) struct page_collect *pcol = priv; if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { - EXOFS_DBGMSG("index=0x%lx\n", page->index); + EXOFS_DBGMSG2("index=0x%lx\n", page->index); page_cache_release(page); return; } - EXOFS_DBGMSG("that_locked_page index=0x%lx\n", + EXOFS_DBGMSG2("that_locked_page index=0x%lx\n", ZERO_PAGE(0) == page ? -1 : page->index); } @@ -961,6 +961,14 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset, WARN_ON(1); } + + /* TODO: Should be easy enough to do proprly */ +static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb, + struct iov_iter *iter, loff_t offset) +{ + return 0; +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, @@ -974,7 +982,7 @@ const struct address_space_operations exofs_aops = { /* Not implemented Yet */ .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ - .direct_IO = NULL, /* TODO: Should be trivial to do */ + .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ .get_xip_mem = NULL, @@ -1010,7 +1018,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) if (likely(!ret)) truncate_setsize(inode, newsize); - EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", + EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n", inode->i_ino, newsize, ret); return ret; } @@ -1094,14 +1102,13 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, /* If object is lost on target we might as well enable it's * delete. */ - if ((ret == -ENOENT) || (ret == -EINVAL)) - ret = 0; + ret = 0; goto out; } ret = extract_attr_from_ios(ios, &attrs[0]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__); goto out; } WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); @@ -1109,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[1]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__); goto out; } if (attrs[1].len) { @@ -1124,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[2]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__); goto out; } if (attrs[2].len) { @@ -1479,7 +1486,7 @@ void exofs_evict_inode(struct inode *inode) struct ore_io_state *ios; int ret; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); /* TODO: should do better here */ if (inode->i_nlink || is_bad_inode(inode)) diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index b7442288860..cfc0205d62c 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->parity = 1; break; case PNFS_OSD_RAID_PQ: + layout->parity = 2; + break; case PNFS_OSD_RAID_4: default: - ORE_ERR("Only RAID_0/5 for now\n"); + ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n", + layout->raid_algorithm); return -EINVAL; } if (0 != (layout->stripe_unit & ~PAGE_MASK)) { @@ -103,7 +106,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * - layout->group_width; + (layout->group_width - layout->parity); if (layout->parity) { unsigned stripe_length = (layout->group_width - layout->parity) * @@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length /= stripe_length; layout->max_io_length *= stripe_length; } + ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length); + return 0; } EXPORT_SYMBOL(ore_verify_layout); @@ -286,7 +291,8 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, if (length) { ore_calc_stripe_info(layout, offset, length, &ios->si); ios->length = ios->si.length; - ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) + + ios->length + PAGE_SIZE - 1) / PAGE_SIZE; if (layout->parity) _ore_post_alloc_raid_stuff(ios); } @@ -430,8 +436,12 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) if (likely(!ret)) continue; - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ + if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) && + per_dev->bio) { + /* start read offset passed endof file. + * Note: if we do not have bio it means read-attributes + * In this case we should return error to caller. + */ _clear_bio(per_dev->bio); ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", @@ -536,24 +546,28 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, u64 H = LmodS - G * T; u32 N = div_u64(H, U); + u32 Nlast; /* "H - (N * U)" is just "H % U" so it's bound to u32 */ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; + u32 first_dev = C - C % group_width; div_u64_rem(file_offset, stripe_unit, &si->unit_off); si->obj_offset = si->unit_off + (N * stripe_unit) + (M * group_depth * stripe_unit); + si->cur_comp = C - first_dev; + si->cur_pg = si->unit_off / PAGE_SIZE; if (parity) { u32 LCMdP = lcm(group_width, parity) / parity; /* R = N % LCMdP; */ u32 RxP = (N % LCMdP) * parity; - u32 first_dev = C - C % group_width; si->par_dev = (group_width + group_width - parity - RxP) % group_width + first_dev; - si->dev = (group_width + C - RxP) % group_width + first_dev; + si->dev = (group_width + group_width + C - RxP) % + group_width + first_dev; si->bytes_in_stripe = U; si->first_stripe_start = M * S + G * T + N * U; } else { @@ -568,6 +582,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, si->length = T - H; if (si->length > length) si->length = length; + + Nlast = div_u64(H + si->length + U - 1, U); + si->maxdevUnits = Nlast - N; + si->M = M; } EXPORT_SYMBOL(ore_calc_stripe_info); @@ -583,13 +601,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, int ret; if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned nr_pages = ios->nr_pages * ios->layout->group_width / - (ios->layout->group_width - - ios->layout->parity); - unsigned bio_size = (nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned bio_size; + + if (!ios->reading) { + bio_size = ios->si.maxdevUnits; + } else { + bio_size = (ios->si.maxdevUnits + 1) * + (ios->layout->group_width - ios->layout->parity) / + ios->layout->group_width; + } + bio_size *= (ios->layout->stripe_unit / PAGE_SIZE); per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { @@ -609,8 +630,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) { - ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", - per_dev->bio->bi_vcnt); + /* If bi_vcnt == bi_max then this is a SW BUG */ + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x " + "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n", + per_dev->bio->bi_vcnt, + per_dev->bio->bi_max_vecs, + BIO_MAX_PAGES_KMALLOC, cur_len); ret = -ENOMEM; goto out; } @@ -632,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance return ret; } +static int _add_parity_units(struct ore_io_state *ios, + struct ore_striping_info *si, + unsigned dev, unsigned first_dev, + unsigned mirrors_p1, unsigned devs_in_group, + unsigned cur_len) +{ + unsigned do_parity; + int ret = 0; + + for (do_parity = ios->layout->parity; do_parity; --do_parity) { + struct ore_per_dev_state *per_dev; + + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length && !per_dev->offset) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } + + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len, + do_parity == 1); + if (unlikely(ret)) + break; + + if (do_parity != 1) { + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; + si->cur_comp = (si->cur_comp + 1) % + ios->layout->group_width; + } + } + + return ret; +} + static int _prepare_for_striping(struct ore_io_state *ios) { struct ore_striping_info *si = &ios->si; @@ -641,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios) unsigned devs_in_group = group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned dev_order; unsigned cur_pg = ios->pages_consumed; u64 length = ios->length; int ret = 0; @@ -653,16 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios) BUG_ON(length > si->length); - dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); - si->cur_comp = dev_order; - si->cur_pg = si->unit_off / PAGE_SIZE; - while (length) { - unsigned comp = dev - first_dev; - struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; + struct ore_per_dev_state *per_dev = + &ios->per_dev[dev - first_dev]; unsigned cur_len, page_off = 0; - if (!per_dev->length) { + if (!per_dev->length && !per_dev->offset) { + /* First time initialize the per_dev info. */ per_dev->dev = dev; if (dev == si->dev) { WARN_ON(dev == si->par_dev); @@ -671,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && (page_off != ios->pgbase)); } else { - if (si->cur_comp > dev_order) - per_dev->offset = - si->obj_offset - si->unit_off; - else /* si->cur_comp < dev_order */ - per_dev->offset = - si->obj_offset + stripe_unit - - si->unit_off; + per_dev->offset = si->obj_offset - si->unit_off; cur_len = stripe_unit; } } else { @@ -691,11 +743,9 @@ static int _prepare_for_striping(struct ore_io_state *ios) if (unlikely(ret)) goto out; - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - length -= cur_len; + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; si->cur_comp = (si->cur_comp + 1) % group_width; if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { if (!length && ios->sp2d) { @@ -703,23 +753,16 @@ static int _prepare_for_striping(struct ore_io_state *ios) * stripe. then operate on parity dev. */ dev = si->par_dev; - } - if (ios->sp2d) - /* In writes cur_len just means if it's the - * last one. See _ore_add_parity_unit. - */ - cur_len = length; - per_dev = &ios->per_dev[dev - first_dev]; - if (!per_dev->length) { - /* Only/always the parity unit of the first - * stripe will be empty. So this is a chance to - * initialize the per_dev info. - */ - per_dev->dev = dev; - per_dev->offset = si->obj_offset - si->unit_off; + /* If last stripe operate on parity comp */ + si->cur_comp = group_width - ios->layout->parity; } - ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + ret = _add_parity_units(ios, si, dev, first_dev, + mirrors_p1, devs_in_group, + ios->sp2d ? length : cur_len); if (unlikely(ret)) goto out; @@ -730,6 +773,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) /* Next stripe, start fresh */ si->cur_comp = 0; si->cur_pg = 0; + si->obj_offset += cur_len; + si->unit_off = 0; } } out: @@ -1098,7 +1143,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, size_attr->attr = g_attr_logical_length; size_attr->attr.val_ptr = &size_attr->newsize; - ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", + ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 7682b970d0f..7f20f25c232 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -21,12 +21,12 @@ #undef ORE_DBGMSG2 #define ORE_DBGMSG2 ORE_DBGMSG -struct page *_raid_page_alloc(void) +static struct page *_raid_page_alloc(void) { return alloc_page(GFP_KERNEL); } -void _raid_page_free(struct page *p) +static void _raid_page_free(struct page *p) { __free_page(p); } @@ -218,22 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) { unsigned p; + unsigned tx_flags = ASYNC_TX_ACK; + + if (sp2d->parity == 1) + tx_flags |= ASYNC_TX_XOR_ZERO_DST; + for (p = 0; p < sp2d->pages_in_unit; p++) { struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; if (!_1ps->write_count) continue; - init_async_submit(&_1ps->submit, - ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, - NULL, - NULL, NULL, - (addr_conv_t *)_1ps->scribble); - - /* TODO: raid6 */ - _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, - 0, sp2d->data_devs, PAGE_SIZE, - &_1ps->submit); + init_async_submit(&_1ps->submit, tx_flags, + NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); + + if (sp2d->parity == 1) + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], + _1ps->pages, 0, sp2d->data_devs, + PAGE_SIZE, &_1ps->submit); + else /* parity == 2 */ + _1ps->tx = async_gen_syndrome(_1ps->pages, 0, + sp2d->data_devs + sp2d->parity, + PAGE_SIZE, &_1ps->submit); } for (p = 0; p < sp2d->pages_in_unit; p++) { @@ -404,9 +410,8 @@ static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) ore_calc_stripe_info(ios->layout, *offset, 0, &si); - p = si.unit_off / PAGE_SIZE; - c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, - ios->layout->mirrors_p1, si.par_dev, si.dev); + p = si.cur_pg; + c = si.cur_comp; page = ios->sp2d->_1p_stripes[p].pages[c]; pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); @@ -534,9 +539,8 @@ static int _read_4_write_last_stripe(struct ore_io_state *ios) goto read_it; ore_calc_stripe_info(ios->layout, offset, 0, &read_si); - p = read_si.unit_off / PAGE_SIZE; - c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, - ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); + p = read_si.cur_pg; + c = read_si.cur_comp; if (min_p == sp2d->pages_in_unit) { /* Didn't do it yet */ @@ -620,7 +624,7 @@ static int _read_4_write_execute(struct ore_io_state *ios) int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, struct ore_per_dev_state *per_dev, - unsigned cur_len) + unsigned cur_len, bool do_xor) { if (ios->reading) { if (per_dev->cur_sg >= ios->sgs_per_dev) { @@ -640,17 +644,16 @@ int _ore_add_parity_unit(struct ore_io_state *ios, si->cur_pg = _sp2d_min_pg(sp2d); num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; - if (!cur_len) /* If last stripe operate on parity comp */ - si->cur_comp = sp2d->data_devs; - if (!per_dev->length) { per_dev->offset += si->cur_pg * PAGE_SIZE; /* If first stripe, Read in all read4write pages * (if needed) before we calculate the first parity. */ - _read_4_write_first_stripe(ios); + if (do_xor) + _read_4_write_first_stripe(ios); } - if (!cur_len) /* If last stripe r4w pages of last stripe */ + if (!cur_len && do_xor) + /* If last stripe r4w pages of last stripe */ _read_4_write_last_stripe(ios); _read_4_write_execute(ios); @@ -662,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios, ++(ios->cur_par_page); } - BUG_ON(si->cur_comp != sp2d->data_devs); + BUG_ON(si->cur_comp < sp2d->data_devs); BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, @@ -670,9 +673,10 @@ int _ore_add_parity_unit(struct ore_io_state *ios, if (unlikely(ret)) return ret; - /* TODO: raid6 if (last_parity_dev) */ - _gen_xor_unit(sp2d); - _sp2d_reset(sp2d, ios->r4w, ios->private); + if (do_xor) { + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); + } } return 0; } diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index 2ffd2c3c6e4..cf6375d8212 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -31,24 +31,6 @@ #define ORE_DBGMSG2(M...) do {} while (0) /* #define ORE_DBGMSG2 ORE_DBGMSG */ -/* Calculate the component order in a stripe. eg the logical data unit - * address within the stripe of @dev given the @par_dev of this stripe. - */ -static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, - unsigned par_dev, unsigned dev) -{ - unsigned first_dev = dev - dev % devs_in_group; - - dev -= first_dev; - par_dev -= first_dev; - - if (devs_in_group == par_dev) /* The raid 0 case */ - return dev / mirrors_p1; - /* raid4/5/6 case */ - return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / - mirrors_p1; -} - /* ios_raid.c stuff needed by ios.c */ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); void _ore_free_raid_stuff(struct ore_io_state *ios); @@ -56,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios); void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, bool not_last); int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, - struct ore_per_dev_state *per_dev, unsigned cur_len); + struct ore_per_dev_state *per_dev, unsigned cur_len, + bool do_xor); void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, struct ore_striping_info *si, struct page *page); static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 9d976332873..ed73ed8ebbe 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -543,7 +543,7 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } -int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, +static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_dev **peds) { struct __alloc_ore_devs_and_exofs_devs { diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index a235f001688..b01fbfb51f4 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -69,145 +69,162 @@ find_acceptable_alias(struct dentry *result, return NULL; } -/* - * Find root of a disconnected subtree and return a reference to it. - */ -static struct dentry * -find_disconnected_root(struct dentry *dentry) +static bool dentry_connected(struct dentry *dentry) { dget(dentry); - while (!IS_ROOT(dentry)) { + while (dentry->d_flags & DCACHE_DISCONNECTED) { struct dentry *parent = dget_parent(dentry); - if (!(parent->d_flags & DCACHE_DISCONNECTED)) { + dput(dentry); + if (IS_ROOT(dentry)) { dput(parent); - break; + return false; } + dentry = parent; + } + dput(dentry); + return true; +} + +static void clear_disconnected(struct dentry *dentry) +{ + dget(dentry); + while (dentry->d_flags & DCACHE_DISCONNECTED) { + struct dentry *parent = dget_parent(dentry); + + WARN_ON_ONCE(IS_ROOT(dentry)); + + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_DISCONNECTED; + spin_unlock(&dentry->d_lock); dput(dentry); dentry = parent; } - return dentry; + dput(dentry); +} + +/* + * Reconnect a directory dentry with its parent. + * + * This can return a dentry, or NULL, or an error. + * + * In the first case the returned dentry is the parent of the given + * dentry, and may itself need to be reconnected to its parent. + * + * In the NULL case, a concurrent VFS operation has either renamed or + * removed this directory. The concurrent operation has reconnected our + * dentry, so we no longer need to. + */ +static struct dentry *reconnect_one(struct vfsmount *mnt, + struct dentry *dentry, char *nbuf) +{ + struct dentry *parent; + struct dentry *tmp; + int err; + + parent = ERR_PTR(-EACCES); + mutex_lock(&dentry->d_inode->i_mutex); + if (mnt->mnt_sb->s_export_op->get_parent) + parent = mnt->mnt_sb->s_export_op->get_parent(dentry); + mutex_unlock(&dentry->d_inode->i_mutex); + + if (IS_ERR(parent)) { + dprintk("%s: get_parent of %ld failed, err %d\n", + __func__, dentry->d_inode->i_ino, PTR_ERR(parent)); + return parent; + } + + dprintk("%s: find name of %lu in %lu\n", __func__, + dentry->d_inode->i_ino, parent->d_inode->i_ino); + err = exportfs_get_name(mnt, parent, nbuf, dentry); + if (err == -ENOENT) + goto out_reconnected; + if (err) + goto out_err; + dprintk("%s: found name: %s\n", __func__, nbuf); + mutex_lock(&parent->d_inode->i_mutex); + tmp = lookup_one_len(nbuf, parent, strlen(nbuf)); + mutex_unlock(&parent->d_inode->i_mutex); + if (IS_ERR(tmp)) { + dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); + goto out_err; + } + if (tmp != dentry) { + dput(tmp); + goto out_reconnected; + } + dput(tmp); + if (IS_ROOT(dentry)) { + err = -ESTALE; + goto out_err; + } + return parent; + +out_err: + dput(parent); + return ERR_PTR(err); +out_reconnected: + dput(parent); + /* + * Someone must have renamed our entry into another parent, in + * which case it has been reconnected by the rename. + * + * Or someone removed it entirely, in which case filehandle + * lookup will succeed but the directory is now IS_DEAD and + * subsequent operations on it will fail. + * + * Alternatively, maybe there was no race at all, and the + * filesystem is just corrupt and gave us a parent that doesn't + * actually contain any entry pointing to this inode. So, + * double check that this worked and return -ESTALE if not: + */ + if (!dentry_connected(dentry)) + return ERR_PTR(-ESTALE); + return NULL; } /* * Make sure target_dir is fully connected to the dentry tree. * - * It may already be, as the flag isn't always updated when connection happens. + * On successful return, DCACHE_DISCONNECTED will be cleared on + * target_dir, and target_dir->d_parent->...->d_parent will reach the + * root of the filesystem. + * + * Whenever DCACHE_DISCONNECTED is unset, target_dir is fully connected. + * But the converse is not true: target_dir may have DCACHE_DISCONNECTED + * set but already be connected. In that case we'll verify the + * connection to root and then clear the flag. + * + * Note that target_dir could be removed by a concurrent operation. In + * that case reconnect_path may still succeed with target_dir fully + * connected, but further operations using the filehandle will fail when + * necessary (due to S_DEAD being set on the directory). */ static int reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf) { - int noprogress = 0; - int err = -ESTALE; + struct dentry *dentry, *parent; - /* - * It is possible that a confused file system might not let us complete - * the path to the root. For example, if get_parent returns a directory - * in which we cannot find a name for the child. While this implies a - * very sick filesystem we don't want it to cause knfsd to spin. Hence - * the noprogress counter. If we go through the loop 10 times (2 is - * probably enough) without getting anywhere, we just give up - */ - while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) { - struct dentry *pd = find_disconnected_root(target_dir); - - if (!IS_ROOT(pd)) { - /* must have found a connected parent - great */ - spin_lock(&pd->d_lock); - pd->d_flags &= ~DCACHE_DISCONNECTED; - spin_unlock(&pd->d_lock); - noprogress = 0; - } else if (pd == mnt->mnt_sb->s_root) { - printk(KERN_ERR "export: Eeek filesystem root is not connected, impossible\n"); - spin_lock(&pd->d_lock); - pd->d_flags &= ~DCACHE_DISCONNECTED; - spin_unlock(&pd->d_lock); - noprogress = 0; - } else { - /* - * We have hit the top of a disconnected path, try to - * find parent and connect. - * - * Racing with some other process renaming a directory - * isn't much of a problem here. If someone renames - * the directory, it will end up properly connected, - * which is what we want - * - * Getting the parent can't be supported generically, - * the locking is too icky. - * - * Instead we just return EACCES. If server reboots - * or inodes get flushed, you lose - */ - struct dentry *ppd = ERR_PTR(-EACCES); - struct dentry *npd; - - mutex_lock(&pd->d_inode->i_mutex); - if (mnt->mnt_sb->s_export_op->get_parent) - ppd = mnt->mnt_sb->s_export_op->get_parent(pd); - mutex_unlock(&pd->d_inode->i_mutex); - - if (IS_ERR(ppd)) { - err = PTR_ERR(ppd); - dprintk("%s: get_parent of %ld failed, err %d\n", - __func__, pd->d_inode->i_ino, err); - dput(pd); - break; - } + dentry = dget(target_dir); - dprintk("%s: find name of %lu in %lu\n", __func__, - pd->d_inode->i_ino, ppd->d_inode->i_ino); - err = exportfs_get_name(mnt, ppd, nbuf, pd); - if (err) { - dput(ppd); - dput(pd); - if (err == -ENOENT) - /* some race between get_parent and - * get_name? just try again - */ - continue; - break; - } - dprintk("%s: found name: %s\n", __func__, nbuf); - mutex_lock(&ppd->d_inode->i_mutex); - npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); - mutex_unlock(&ppd->d_inode->i_mutex); - if (IS_ERR(npd)) { - err = PTR_ERR(npd); - dprintk("%s: lookup failed: %d\n", - __func__, err); - dput(ppd); - dput(pd); - break; - } - /* we didn't really want npd, we really wanted - * a side-effect of the lookup. - * hopefully, npd == pd, though it isn't really - * a problem if it isn't - */ - if (npd == pd) - noprogress = 0; - else - printk("%s: npd != pd\n", __func__); - dput(npd); - dput(ppd); - if (IS_ROOT(pd)) { - /* something went wrong, we have to give up */ - dput(pd); - break; - } - } - dput(pd); - } + while (dentry->d_flags & DCACHE_DISCONNECTED) { + BUG_ON(dentry == mnt->mnt_sb->s_root); - if (target_dir->d_flags & DCACHE_DISCONNECTED) { - /* something went wrong - oh-well */ - if (!err) - err = -ESTALE; - return err; - } + if (IS_ROOT(dentry)) + parent = reconnect_one(mnt, dentry, nbuf); + else + parent = dget_parent(dentry); + if (!parent) + break; + dput(dentry); + if (IS_ERR(parent)) + return PTR_ERR(parent); + dentry = parent; + } + dput(dentry); + clear_disconnected(target_dir); return 0; } @@ -215,7 +232,7 @@ struct getdents_callback { struct dir_context ctx; char *name; /* name that was found. It already points to a buffer NAME_MAX+1 is size */ - unsigned long ino; /* the inum we are looking for */ + u64 ino; /* the inum we are looking for */ int found; /* inode matched? */ int sequence; /* sequence counter */ }; @@ -242,7 +259,7 @@ static int filldir_one(void * __buf, const char * name, int len, /** * get_name - default export_operations->get_name function - * @dentry: the directory in which to find a name + * @path: the directory in which to find a name * @name: a pointer to a %NAME_MAX+1 char buffer to store the name * @child: the dentry for the child directory. * @@ -255,10 +272,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child) struct inode *dir = path->dentry->d_inode; int error; struct file *file; + struct kstat stat; + struct path child_path = { + .mnt = path->mnt, + .dentry = child, + }; struct getdents_callback buffer = { .ctx.actor = filldir_one, .name = name, - .ino = child->d_inode->i_ino }; error = -ENOTDIR; @@ -268,6 +289,16 @@ static int get_name(const struct path *path, char *name, struct dentry *child) if (!dir->i_fop) goto out; /* + * inode->i_ino is unsigned long, kstat->ino is u64, so the + * former would be insufficient on 32-bit hosts when the + * filesystem supports 64-bit inode numbers. So we need to + * actually call ->getattr, not just read i_ino: + */ + error = vfs_getattr_nosec(&child_path, &stat); + if (error) + return error; + buffer.ino = stat.ino; + /* * Open the directory ... */ file = dentry_open(path, O_RDONLY, cred); @@ -306,7 +337,7 @@ out: /** * export_encode_fh - default export_operations->encode_fh function * @inode: the object to encode - * @fh: where to store the file handle fragment + * @fid: where to store the file handle fragment * @max_len: maximum length to store there * @parent: parent directory inode, if wanted * diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 110b6b371a4..27695e6f4e4 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -4,7 +4,6 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ -#include <linux/capability.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> @@ -148,13 +147,6 @@ ext2_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -189,19 +181,14 @@ ext2_get_acl(struct inode *inode, int type) /* * inode->i_mutex: down */ -static int -ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +int +ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int name_index; void *value = NULL; size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - switch(type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -250,169 +237,21 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) int ext2_init_acl(struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext2_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) { - /* This is an extended ACL */ - error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext2_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error; + struct posix_acl *default_acl, *acl; + int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; - error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; -} -/* - * Extended attribut handlers - */ -static size_t -ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - - error = ext2_set_acl(dentry->d_inode, type, acl); - -release_and_out: - posix_acl_release(acl); + if (default_acl) { + error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } return error; } - -const struct xattr_handler ext2_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext2_xattr_list_acl_access, - .get = ext2_xattr_get_acl, - .set = ext2_xattr_set_acl, -}; - -const struct xattr_handler ext2_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext2_xattr_list_acl_default, - .get = ext2_xattr_get_acl, - .set = ext2_xattr_set_acl, -}; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 503bfb0ed79..44937f9fcf3 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -55,7 +55,7 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); -extern int ext2_acl_chmod (struct inode *); +extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); #else @@ -63,12 +63,6 @@ extern int ext2_init_acl (struct inode *, struct inode *); #define ext2_get_acl NULL #define ext2_set_acl NULL -static inline int -ext2_acl_chmod (struct inode *inode) -{ - return 0; -} - static inline int ext2_init_acl (struct inode *inode, struct inode *dir) { return 0; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a5b3a5db312..7c87b22a722 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -62,10 +62,10 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ const struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, @@ -75,7 +75,7 @@ const struct file_operations ext2_file_operations = { .release = ext2_release_file, .fsync = ext2_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; #ifdef CONFIG_EXT2_FS_XIP @@ -103,5 +103,6 @@ const struct inode_operations ext2_file_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 7cadd823bb3..7d66fb0e4cc 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -284,7 +284,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - get_random_bytes(&group, sizeof(group)); + group = prandom_u32(); parent_group = (unsigned)group % ngroups; for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c260de6d7b6..36d35c36311 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -78,7 +78,7 @@ void ext2_evict_inode(struct inode * inode) dquot_drop(inode); } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (want_delete) { sb_start_intwrite(inode->i_sb); @@ -632,6 +632,8 @@ static int ext2_get_blocks(struct inode *inode, int count = 0; ext2_fsblk_t first_block = 0; + BUG_ON(maxblocks == 0); + depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary); if (depth == 0) @@ -848,18 +850,18 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block) } static ssize_t -ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - ext2_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); if (ret < 0 && (rw & WRITE)) - ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); + ext2_write_failed(mapping, offset + count); return ret; } @@ -1564,7 +1566,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) } setattr_copy(inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = ext2_acl_chmod(inode); + error = posix_acl_chmod(inode, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 256dd5f4c1c..c268d0af1db 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -421,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, .tmpfile = ext2_tmpfile, }; @@ -433,4 +434,5 @@ const struct inode_operations ext2_special_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, }; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 288534920fe..3750031cfa2 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -192,7 +192,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", sizeof(struct ext2_inode_info), @@ -1254,6 +1254,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) unsigned long old_sb_flags; int err; + sync_filesystem(sb); spin_lock(&sbi->s_lock); /* Store the old options */ @@ -1493,6 +1494,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; + tmp_bh.b_size = sb->s_blocksize; err = ext2_get_block(inode, blk, &tmp_bh, 1); if (err < 0) goto out; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 2d7557db3ae..91426141c33 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -103,8 +103,8 @@ static struct mb_cache *ext2_xattr_cache; static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL - [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, - [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext2_xattr_acl_default_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_SECURITY @@ -116,8 +116,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = { &ext2_xattr_user_handler, &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL - &ext2_xattr_acl_access_handler, - &ext2_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT2_FS_SECURITY &ext2_xattr_security_handler, diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 5e41cccff76..60edf298644 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -57,8 +57,6 @@ struct ext2_xattr_entry { extern const struct xattr_handler ext2_xattr_user_handler; extern const struct xattr_handler ext2_xattr_trusted_handler; -extern const struct xattr_handler ext2_xattr_acl_access_handler; -extern const struct xattr_handler ext2_xattr_acl_default_handler; extern const struct xattr_handler ext2_xattr_security_handler; extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index cfedb2cb0d8..c0ebc4db884 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -42,8 +42,8 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, value, size, flags); } -int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; int err = 0; diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index 1c3312858fc..e98171a11cf 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -35,6 +35,7 @@ __ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, int rc; memset(&tmp, 0, sizeof(struct buffer_head)); + tmp.b_size = 1 << inode->i_blkbits; rc = ext2_get_block(inode, pgoff, &tmp, create); *result = tmp.b_blocknr; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index dbb5ad59a7f..8bbaf5bcf98 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -145,13 +145,6 @@ ext3_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -190,7 +183,7 @@ ext3_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext3_new_inode */ static int -ext3_set_acl(handle_t *handle, struct inode *inode, int type, +__ext3_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { int name_index; @@ -198,9 +191,6 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch(type) { case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -243,204 +233,49 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -/* - * Initialize the ACLs of a new inode. Called from ext3_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ int -ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext3_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (error < 0) - return error; - - if (error > 0) { - /* This is an extended ACL */ - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext3_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; handle_t *handle; - int retries = 0; - int error; + int error, retries = 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; retry: - handle = ext3_journal_start(inode, - EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext3_std_error(inode->i_sb, error); - goto out; - } - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = __ext3_set_acl(handle, inode, type, acl); ext3_journal_stop(handle); - if (error == -ENOSPC && - ext3_should_retry_alloc(inode->i_sb, &retries)) + if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; -out: - posix_acl_release(acl); return error; } /* - * Extended attribute handlers + * Initialize the ACLs of a new inode. Called from ext3_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ -static size_t -ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) +int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext3_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - -retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - error = ext3_set_acl(handle, inode, type, acl); - ext3_journal_stop(handle); - if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; -release_and_out: - posix_acl_release(acl); + if (default_acl) { + error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); + } return error; } - -const struct xattr_handler ext3_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext3_xattr_list_acl_access, - .get = ext3_xattr_get_acl, - .set = ext3_xattr_set_acl, -}; - -const struct xattr_handler ext3_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext3_xattr_list_acl_default, - .get = ext3_xattr_get_acl, - .set = ext3_xattr_set_acl, -}; diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index dbc921e458c..ea1c69edab9 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -55,18 +55,13 @@ static inline int ext3_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); -extern int ext3_acl_chmod (struct inode *); +extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> #define ext3_get_acl NULL - -static inline int -ext3_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext3_set_acl NULL static inline int ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 22548f56197..158b5d4ce06 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1727,10 +1727,7 @@ allocated: percpu_counter_sub(&sbi->s_freeblocks_counter, num); BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); - err = ext3_journal_dirty_metadata(handle, gdp_bh); - if (!fatal) - fatal = err; - + fatal = ext3_journal_dirty_metadata(handle, gdp_bh); if (fatal) goto out; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index bafdd48eefd..17742eed2c1 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -275,7 +275,7 @@ static inline loff_t ext3_get_htree_eof(struct file *filp) * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) * will be invalid once the directory was converted into a dx directory */ -loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) +static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); @@ -309,43 +309,17 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); - while (fname) { - struct fname * old = fname; + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) + do { + struct fname *old = fname; fname = fname->next; - kfree (old); - } - if (!parent) - *root = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } -} + kfree(old); + } while (fname); + *root = RB_ROOT; +} static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, loff_t pos) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 25cb413277e..a062fa1e1b1 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -50,10 +50,10 @@ static int ext3_release_file (struct inode * inode, struct file * filp) const struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, @@ -63,7 +63,7 @@ const struct file_operations ext3_file_operations = { .release = ext3_release_file, .fsync = ext3_sync_file, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; const struct inode_operations ext3_file_inode_operations = { @@ -75,6 +75,7 @@ const struct inode_operations ext3_file_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 082afd78b10..a1b810230cc 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -215,7 +215,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - get_random_bytes(&group, sizeof(group)); + group = prandom_u32(); parent_group = (unsigned)group % ngroups; for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2bd85486b87..2c6ccc49ba2 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -228,7 +228,7 @@ void ext3_evict_inode (struct inode *inode) log_wait_commit(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); ext3_discard_reservation(inode); rsv = ei->i_block_alloc_info; @@ -1559,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) } /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext3_get_block() should be journalled - * along with the data so we don't crash and then get metadata which + * Note that whenever we need to map blocks we start a transaction even if + * we're not journalling data. This is to preserve ordering: any hole + * instantiation within __block_write_full_page -> ext3_get_block() should be + * journalled along with the data so we don't crash and then get metadata which * refers to old data. * * In all journalling modes block_write_full_page() will start the I/O. * - * Problem: - * - * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext3_writepage() - * - * Similar for: - * - * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext3_get_block(). We will deadlock on various things like - * lock_journal and i_truncate_mutex. - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. - * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. - * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). - * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). - * We'll probably need that anyway for journalling writepage() output. - * * We don't honour synchronous mounts for writepage(). That would be * disastrous. Any write() or metadata operation will sync the fs for * us. - * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ static int ext3_ordered_writepage(struct page *page, struct writeback_control *wbc) @@ -1673,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page, * block_write_full_page() succeeded. Otherwise they are unmapped, * and generally junk. */ - if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + if (ret == 0) + ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, journal_dirty_data_fn); - if (!ret) - ret = err; - } walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, bput_one); err = ext3_journal_stop(handle); @@ -1758,17 +1716,17 @@ static int ext3_journalled_writepage(struct page *page, WARN_ON_ONCE(IS_RDONLY(inode) && !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); - if (ext3_journal_current_handle()) - goto no_write; - trace_ext3_journalled_writepage(page); - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto no_write; - } - if (!page_has_buffers(page) || PageChecked(page)) { + if (ext3_journal_current_handle()) + goto no_write; + + handle = ext3_journal_start(inode, + ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto no_write; + } /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. @@ -1791,17 +1749,18 @@ static int ext3_journalled_writepage(struct page *page, atomic_set(&EXT3_I(inode)->i_datasync_tid, handle->h_transaction->t_tid); unlock_page(page); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; } else { /* - * It may be a page full of checkpoint-mode buffers. We don't - * really know unless we go poke around in the buffer_heads. - * But block_write_full_page will do the right thing. + * It is a page full of checkpoint-mode buffers. Go and write + * them. They should have been already mapped when they went + * to the journal so provide NULL get_block function to catch + * errors. */ - ret = block_write_full_page(page, ext3_get_block, wbc); + ret = block_write_full_page(page, NULL, wbc); } - err = ext3_journal_stop(handle); - if (!ret) - ret = err; out: return ret; @@ -1862,8 +1821,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait) * VFS code falls back into buffered path in that case so we are safe. */ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -1871,10 +1829,10 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, handle_t *handle; ssize_t ret; int orphan = 0; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); int retries = 0; - trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + trace_ext3_direct_IO_enter(inode, offset, count, rw); if (rw == WRITE) { loff_t final_size = offset + count; @@ -1898,15 +1856,14 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, } retry: - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - ext3_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) ext3_truncate_failed_direct_write(inode); @@ -1925,6 +1882,8 @@ retry: * and pretend the write failed... */ ext3_truncate_failed_direct_write(inode); ret = PTR_ERR(handle); + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); goto out; } if (inode->i_nlink) @@ -1949,8 +1908,7 @@ retry: ret = err; } out: - trace_ext3_direct_IO_exit(inode, offset, - iov_length(iov, nr_segs), rw, ret); + trace_ext3_direct_IO_exit(inode, offset, count, rw, ret); return ret; } @@ -3212,21 +3170,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (for sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -3238,13 +3195,13 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) { - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (ext3_journal_current_handle()) { @@ -3253,7 +3210,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext3_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; return ext3_force_commit(inode->i_sb); @@ -3365,7 +3327,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) mark_inode_dirty(inode); if (ia_valid & ATTR_MODE) - rc = ext3_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext3_std_error(inode->i_sb, error); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 1194b1f0f83..f197736dccf 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1783,7 +1783,7 @@ retry: d_tmpfile(dentry, inode); err = ext3_orphan_add(handle, inode); if (err) - goto err_drop_inode; + goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } @@ -1791,10 +1791,9 @@ retry: if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; -err_drop_inode: +err_unlock_inode: ext3_journal_stop(handle); unlock_new_inode(inode); - iput(inode); return err; } @@ -2570,6 +2569,7 @@ const struct inode_operations ext3_dir_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2581,4 +2581,5 @@ const struct inode_operations ext3_special_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index c50c7619037..08cdfe5461e 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -527,7 +527,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", sizeof(struct ext3_inode_info), @@ -2649,6 +2649,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) int i; #endif + sync_filesystem(sb); + /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; @@ -2825,6 +2827,10 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) * bitmap, and an inode table. */ overhead += ngroups * (2 + sbi->s_itb_per_group); + + /* Add the journal blocks as well */ + overhead += sbi->s_journal->j_maxlen; + sbi->s_overhead_last = overhead; smp_wmb(); sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index b1fc96383e0..c6874be6d58 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -102,8 +102,8 @@ static struct mb_cache *ext3_xattr_cache; static const struct xattr_handler *ext3_xattr_handler_map[] = { [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, - [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_SECURITY @@ -115,8 +115,8 @@ const struct xattr_handler *ext3_xattr_handlers[] = { &ext3_xattr_user_handler, &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - &ext3_xattr_acl_access_handler, - &ext3_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT3_FS_SECURITY &ext3_xattr_security_handler, diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 2be4f69bfa6..32e93ebf803 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -60,8 +60,6 @@ struct ext3_xattr_entry { extern const struct xattr_handler ext3_xattr_user_handler; extern const struct xattr_handler ext3_xattr_trusted_handler; -extern const struct xattr_handler ext3_xattr_acl_access_handler; -extern const struct xattr_handler ext3_xattr_acl_default_handler; extern const struct xattr_handler ext3_xattr_security_handler; extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 3387664ad70..722c2bf9645 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -43,8 +43,9 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int ext3_initxattrs(struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; handle_t *handle = fs_info; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 39a54a0e9fe..d40c8dbbb0d 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -152,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -196,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext4_new_inode */ static int -ext4_set_acl(handle_t *handle, struct inode *inode, int type, +__ext4_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { int name_index; @@ -204,9 +197,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -248,208 +238,51 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -/* - * Initialize the ACLs of a new inode. Called from ext4_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ int -ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext4_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (error < 0) - return error; - - if (error > 0) { - /* This is an extended ACL */ - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext4_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; handle_t *handle; - int retries = 0; - int error; - + int error, retries = 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, ext4_jbd2_credits_xattr(inode)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext4_std_error(inode->i_sb, error); - goto out; - } - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + error = __ext4_set_acl(handle, inode, type, acl); ext4_journal_stop(handle); - if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; -out: - posix_acl_release(acl); return error; } /* - * Extended attribute handlers + * Initialize the ACLs of a new inode. Called from ext4_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ -static size_t -ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) +int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext4_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; -retry: - handle = ext4_journal_start(inode, EXT4_HT_XATTR, - ext4_jbd2_credits_xattr(inode)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto release_and_out; + if (default_acl) { + error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); } - error = ext4_set_acl(handle, inode, type, acl); - ext4_journal_stop(handle); - if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - -release_and_out: - posix_acl_release(acl); return error; } - -const struct xattr_handler ext4_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext4_xattr_list_acl_access, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; - -const struct xattr_handler ext4_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext4_xattr_list_acl_default, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 18cb39ed7c7..da2c79577d7 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -55,18 +55,13 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type); -extern int ext4_acl_chmod(struct inode *); +int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> #define ext4_get_acl NULL - -static inline int -ext4_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext4_set_acl NULL static inline int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index dc5d572ebd6..fca382037dd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -83,9 +83,9 @@ static inline int ext4_block_in_group(struct super_block *sb, /* Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system. */ -unsigned ext4_num_overhead_clusters(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +static unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { unsigned num_clusters; int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; @@ -176,9 +176,10 @@ static unsigned int num_clusters_in_group(struct super_block *sb, } /* Initializes an uninitialized block bitmap */ -void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +static void ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -193,7 +194,16 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -307,6 +317,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; ext4_fsblk_t blk; @@ -326,14 +337,14 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; - if (!ext4_test_bit(offset, bh->b_data)) + if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; - if (!ext4_test_bit(offset, bh->b_data)) + if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; @@ -341,21 +352,23 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, - offset + EXT4_SB(sb)->s_itb_per_group, - offset); - if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group) + EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group), + EXT4_B2C(sbi, offset)); + if (next_zero_bit < + EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group)) /* bad bitmap for inode tables */ return blk; return 0; } -void ext4_validate_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - ext4_group_t block_group, - struct buffer_head *bh) +static void ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return; @@ -366,6 +379,9 @@ void ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -373,6 +389,9 @@ void ext4_validate_block_bitmap(struct super_block *sb, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -640,6 +659,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) struct ext4_group_desc *gdp; ext4_group_t i; ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_info *grp; #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; @@ -655,14 +675,18 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_group_clusters(sb, gdp); + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) continue; x = ext4_count_free(bitmap_bh->b_data, - EXT4_BLOCKS_PER_GROUP(sb) / 8); + EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; @@ -679,7 +703,11 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_group_clusters(sb, gdp); + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; @@ -699,16 +727,6 @@ static inline int test_root(ext4_group_t a, int b) } } -static int ext4_group_sparse(ext4_group_t group) -{ - if (group <= 1) - return 1; - if (!(group & 1)) - return 0; - return (test_root(group, 7) || test_root(group, 5) || - test_root(group, 3)); -} - /** * ext4_bg_has_super - number of blocks used by the superblock in group * @sb: superblock for filesystem @@ -719,11 +737,26 @@ static int ext4_group_sparse(ext4_group_t group) */ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) { - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && - !ext4_group_sparse(group)) + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (group == 0) + return 1; + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) { + if (group == le32_to_cpu(es->s_backup_bgs[0]) || + group == le32_to_cpu(es->s_backup_bgs[1])) + return 1; return 0; - return 1; + } + if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) + return 1; + if (!(group & 1)) + return 0; + if (test_root(group, 3) || (test_root(group, 5)) || + test_root(group, 7)) + return 1; + + return 0; } static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 3f11656bd72..41eb9dcfac7 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -180,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb) /* Called when the filesystem is unmounted */ void ext4_release_system_zone(struct super_block *sb) { - struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; - struct rb_node *parent; - struct ext4_system_zone *entry; + struct ext4_system_zone *entry, *n; - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - entry = rb_entry(n, struct ext4_system_zone, node); + rbtree_postorder_for_each_entry_safe(entry, n, + &EXT4_SB(sb)->system_blks, node) kmem_cache_free(ext4_system_zone_cachep, entry); - if (!parent) - EXT4_SB(sb)->system_blks = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } + EXT4_SB(sb)->system_blks = RB_ROOT; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 680bb338891..ef1bed66c14 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -105,7 +105,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, static int ext4_readdir(struct file *file, struct dir_context *ctx) { unsigned int offset; - int i, stored; + int i; struct ext4_dir_entry_2 *de; int err; struct inode *inode = file_inode(file); @@ -133,7 +133,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return ret; } - stored = 0; offset = ctx->pos & (sb->s_blocksize - 1); while (ctx->pos < inode->i_size) { @@ -353,41 +352,16 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } - if (!parent) - *root = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } + + *root = RB_ROOT; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index af815ea9d7c..7cc5a0e2368 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -29,7 +29,9 @@ #include <linux/wait.h> #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> +#include <linux/ratelimit.h> #include <crypto/hash.h> +#include <linux/falloc.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif @@ -156,7 +158,6 @@ struct ext4_allocation_request { #define EXT4_MAP_MAPPED (1 << BH_Mapped) #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) -#define EXT4_MAP_UNINIT (1 << BH_Uninit) /* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of * ext4_map_blocks wants to know whether or not the underlying cluster has * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that @@ -167,7 +168,7 @@ struct ext4_allocation_request { #define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER) + EXT4_MAP_FROM_CLUSTER) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -182,7 +183,7 @@ struct ext4_map_blocks { #define EXT4_IO_END_UNWRITTEN 0x0001 /* - * For converting uninitialized extents on a work queue. 'handle' is used for + * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. */ typedef struct ext4_io_end { @@ -267,6 +268,16 @@ struct ext4_io_submit { /* Translate # of blks to # of clusters */ #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* * Structure of a blocks group descriptor @@ -525,26 +536,26 @@ enum { /* * Flags used by ext4_map_blocks() */ - /* Allocate any needed blocks and/or convert an unitialized + /* Allocate any needed blocks and/or convert an unwritten extent to be an initialized ext4 */ #define EXT4_GET_BLOCKS_CREATE 0x0001 - /* Request the creation of an unitialized extent */ -#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 -#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Caller is from the delayed allocation writeout path * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an - unitialized extents if not allocated, split the uninitialized + unwritten extents if not allocated, split the unwritten extent if blocks has been preallocated already*/ #define EXT4_GET_BLOCKS_PRE_IO 0x0008 #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 @@ -556,6 +567,8 @@ enum { #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 /* Do not put hole in extent cache */ #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 /* * The bit position of these flags must not overlap with any of the @@ -760,6 +773,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ (einode)->xtime.tv_sec = \ (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ ext4_decode_extra_time(&(einode)->xtime, \ raw_inode->xtime ## _extra); \ @@ -860,6 +875,8 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode *jinode; + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + /* * File creation time. Its function is same as that of * struct timespec i_{a,c,m}time in the generic inode. @@ -985,6 +1002,8 @@ struct ext4_inode_info { #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group size of blocksize * 8 blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1141,7 +1160,8 @@ struct ext4_super_block { __le32 s_usr_quota_inum; /* inode for tracking user quota */ __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ - __le32 s_reserved[108]; /* Padding to the end of the block */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __le32 s_reserved[106]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1313,7 +1333,13 @@ struct ext4_sb_info { struct list_head s_es_lru; unsigned long s_es_last_sorted; struct percpu_counter s_extent_cache_cnt; + struct mb_cache *s_mb_cache; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1396,7 +1422,18 @@ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); #if (BITS_PER_LONG < 64) EXT4_INODE_BIT_FNS(state, state_flags, 0) @@ -1470,6 +1507,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 @@ -1918,10 +1956,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb, extern ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block); -extern void ext4_validate_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - ext4_group_t block_group, - struct buffer_head *bh); extern unsigned int ext4_block_group(struct super_block *sb, ext4_fsblk_t blocknr); extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, @@ -1950,16 +1984,9 @@ extern int ext4_wait_block_bitmap(struct super_block *sb, struct buffer_head *bh); extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); -extern void ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); -extern unsigned ext4_num_overhead_clusters(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ @@ -2102,10 +2129,6 @@ extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from); -extern int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); @@ -2117,8 +2140,7 @@ extern void ext4_da_update_reserve_space(struct inode *inode, extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs); + struct iov_iter *iter, loff_t offset); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); @@ -2165,8 +2187,6 @@ extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ extern int ext4_calculate_overhead(struct super_block *sb); -extern int ext4_superblock_csum_verify(struct super_block *sb, - struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); @@ -2433,23 +2453,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } -/* - * Update i_disksize after writeback has been started. Races with truncate - * are avoided by checking i_size under i_data_sem. - */ -static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) -{ - loff_t i_size; - - down_write(&EXT4_I(inode)->i_data_sem); - i_size = i_size_read(inode); - if (newsize > i_size) - newsize = i_size; - if (newsize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = newsize; - up_write(&EXT4_I(inode)->i_data_sem); -} - struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; @@ -2555,19 +2558,11 @@ extern const struct file_operations ext4_dir_operations; extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); -extern void ext4_unwritten_wait(struct inode *inode); /* inline.c */ extern int ext4_has_inline_data(struct inode *inode); -extern int ext4_get_inline_size(struct inode *inode); extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); -extern void ext4_write_inline_data(struct inode *inode, - struct ext4_iloc *iloc, - void *buffer, loff_t pos, - unsigned int len); -extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); @@ -2728,17 +2723,18 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, struct inode *second); extern void ext4_double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode); -void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2); -void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2); extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent); /* page-io.c */ extern int __init ext4_init_pageio(void); @@ -2754,23 +2750,20 @@ extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, int len, - struct writeback_control *wbc); + struct writeback_control *wbc, + bool keep_towrite); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); -extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp); -extern int ext4_mmp_csum_verify(struct super_block *sb, - struct mmp_struct *mmp); /* * Note that these flags will never ever appear in a buffer_head's state flag. * See EXT4_MAP_... to see where this is used. */ enum ext4_state_bits { - BH_Uninit /* blocks are allocated but uninitialized on disk */ - = BH_JBDPrivateStart, - BH_AllocFromCluster, /* allocated blocks were part of already + BH_AllocFromCluster /* allocated blocks were part of already * allocated cluster. */ + = BH_JBDPrivateStart }; /* diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 5074fe23f19..a867f5ca999 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -137,21 +137,21 @@ struct ext4_ext_path { * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this - * particular extent is an initialized extent or an uninitialized (i.e. + * particular extent is an initialized extent or an unwritten (i.e. * preallocated). - * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an - * uninitialized extent. + * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an + * unwritten extent. * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an - * uninitialized one. In other words, if MSB of ee_len is set, it is an - * uninitialized extent with only one special scenario when ee_len = 0x8000. - * In this case we can not have an uninitialized extent of zero length and + * unwritten one. In other words, if MSB of ee_len is set, it is an + * unwritten extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an unwritten extent of zero length and * thus we make it as a special case of initialized extent with 0x8000 length. * This way we get better extent-to-group alignment for initialized extents. * Hence, the maximum number of blocks we can have in an *initialized* - * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767). + * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). */ #define EXT_INIT_MAX_LEN (1UL << 15) -#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) +#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) #define EXT_FIRST_EXTENT(__hdr__) \ @@ -187,14 +187,14 @@ static inline unsigned short ext_depth(struct inode *inode) return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } -static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) +static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) { - /* We can not have an uninitialized extent of zero length! */ + /* We can not have an unwritten extent of zero length! */ BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); } -static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) +static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) { /* Extent with ee_len of 0x8000 is treated as an initialized extent */ return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 17ac112ab10..0074e0d23d6 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -122,9 +122,10 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, return handle; } -void ext4_journal_abort_handle(const char *caller, unsigned int line, - const char *err_fn, struct buffer_head *bh, - handle_t *handle, int err) +static void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, + struct buffer_head *bh, + handle_t *handle, int err) { char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); @@ -259,6 +260,25 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + if (inode == NULL) { + pr_err("EXT4: jbd2_journal_dirty_metadata " + "failed: handle type %u started at " + "line %u, credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); + return err; + } + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "journal_dirty_metadata failed: " + "handle type %u started at line %u, " + "credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); } } else { if (inode) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 81cfefa9dc0..17c00ff202f 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -231,10 +231,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); /* * Wrapper functions with which ext4 calls into JBD. */ -void ext4_journal_abort_handle(const char *caller, unsigned int line, - const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err); - int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 54d52afcdb1..4da228a0e6d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -37,7 +37,6 @@ #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/falloc.h> #include <asm/uaccess.h> #include <linux/fiemap.h> #include "ext4_jbd2.h" @@ -51,8 +50,8 @@ */ #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ +#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ +#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ #define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ #define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ @@ -144,6 +143,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, { if (path->p_bh) { /* path points to block */ + BUFFER_TRACE(path->p_bh, "get_write_access"); return ext4_journal_get_write_access(handle, path->p_bh); } /* path points to leaf/index in inode body */ @@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); + ext4_lblk_t last = lblock + len - 1; - if (len == 0) + if (lblock > last) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t pblock = 0; + ext4_lblk_t lblock = 0; + ext4_lblk_t prev = 0; + int len = 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; + + /* Check for overlapping extents */ + lblock = le32_to_cpu(ext->ee_block); + len = ext4_ext_get_actual_len(ext); + if ((lblock <= prev) && prev) { + pblock = ext4_ext_pblock(ext); + es->s_last_error_block = cpu_to_le64(pblock); + return 0; + } ext++; entries--; + prev = lblock + len - 1; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); @@ -508,7 +525,7 @@ __read_extent_tree_block(const char *function, unsigned int line, lblk - prev, ~0, EXTENT_STATUS_HOLE); - if (ext4_ext_is_uninitialized(ex)) + if (ext4_ext_is_unwritten(ex)) status = EXTENT_STATUS_UNWRITTEN; ext4_es_cache_extent(inode, lblk, len, ext4_ext_pblock(ex), status); @@ -604,7 +621,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else @@ -630,7 +647,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug("\n"); @@ -661,7 +678,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; @@ -786,7 +803,7 @@ ext4_ext_binsearch(struct inode *inode, ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH @@ -1666,22 +1683,17 @@ int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { - unsigned short ext1_ee_len, ext2_ee_len, max_len; + unsigned short ext1_ee_len, ext2_ee_len; /* * Make sure that both extents are initialized. We don't merge - * uninitialized extents so that we can be sure that end_io code has + * unwritten extents so that we can be sure that end_io code has * the extent that was written properly split out and conversion to * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; - if (ext4_ext_is_uninitialized(ex1)) - max_len = EXT_UNINIT_MAX_LEN; - else - max_len = EXT_INIT_MAX_LEN; - ext1_ee_len = ext4_ext_get_actual_len(ex1); ext2_ee_len = ext4_ext_get_actual_len(ex2); @@ -1694,7 +1706,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * as an RO_COMPAT feature, refuse to merge to extents if * this can result in the top bit of ee_len being set. */ - if (ext1_ee_len + ext2_ee_len > max_len) + if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) + return 0; + if (ext4_ext_is_unwritten(ex1) && + (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || + atomic_read(&EXT4_I(inode)->i_unwritten) || + (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) @@ -1719,8 +1736,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, { struct ext4_extent_header *eh; unsigned int depth, len; - int merge_done = 0; - int uninitialized = 0; + int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1730,12 +1746,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1844,8 +1859,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, depth = ext_depth(inode); if (!path[depth].p_ext) goto out; - b2 = le32_to_cpu(path[depth].p_ext->ee_block); - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path @@ -1855,7 +1869,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ @@ -1890,8 +1904,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; - unsigned uninitialized = 0; - int mb_flags = 0; + int mb_flags = 0, unwritten; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1931,29 +1944,21 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, if (ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %u:[%d]%d" "(from %llu)\n", - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; - - /* - * ext4_can_extents_be_merged should have checked - * that either both extents are uninitialized, or - * both aren't. Thus we need to check only one of - * them here. - */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1965,10 +1970,10 @@ prepend: ext_debug("prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, @@ -1976,20 +1981,13 @@ prepend: if (err) return err; - /* - * ext4_can_extents_be_merged should have checked - * that either both extents are uninitialized, or - * both aren't. Thus we need to check only one of - * them here. - */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -2049,7 +2047,7 @@ has_space: ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { @@ -2060,7 +2058,7 @@ has_space: "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); nearex++; @@ -2071,7 +2069,7 @@ has_space: "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); } @@ -2081,7 +2079,7 @@ has_space: "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, @@ -2203,7 +2201,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, es.es_lblk = le32_to_cpu(ex->ee_block); es.es_len = ext4_ext_get_actual_len(ex); es.es_pblk = ext4_ext_pblock(ex); - if (ext4_ext_is_uninitialized(ex)) + if (ext4_ext_is_unwritten(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; } @@ -2535,7 +2533,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * extent, we have to mark the cluster as used (store negative * cluster number in partial_cluster). */ - unaligned = pblk & (sbi->s_cluster_ratio - 1); + unaligned = EXT4_PBLK_COFF(sbi, pblk); if (unaligned && (ee_len == num) && (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) *partial_cluster = EXT4_B2C(sbi, pblk); @@ -2579,7 +2577,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; - unsigned uninitialized = 0; + unsigned unwritten = 0; struct ext4_extent *ex; ext4_fsblk_t pblk; @@ -2600,18 +2598,39 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); + /* + * If we're starting with an extent other than the last one in the + * node, we need to see if it shares a cluster with the extent to + * the right (towards the end of the file). If its leftmost cluster + * is this extent's rightmost cluster and it is not cluster aligned, + * we'll mark it as a partial that is not to be deallocated. + */ + + if (ex != EXT_LAST_EXTENT(eh)) { + ext4_fsblk_t current_pblk, right_pblk; + long long current_cluster, right_cluster; + + current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + current_cluster = (long long)EXT4_B2C(sbi, current_pblk); + right_pblk = ext4_ext_pblock(ex + 1); + right_cluster = (long long)EXT4_B2C(sbi, right_pblk); + if (current_cluster == right_cluster && + EXT4_PBLK_COFF(sbi, right_pblk)) + *partial_cluster = -right_cluster; + } + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + if (ext4_ext_is_unwritten(ex)) + unwritten = 1; else - uninitialized = 0; + unwritten = 0; ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, - uninitialized, ex_ee_len); + unwritten, ex_ee_len); path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; @@ -2629,7 +2648,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * accidentally freeing it later on */ pblk = ext4_ext_pblock(ex); - if (pblk & (sbi->s_cluster_ratio - 1)) + if (EXT4_PBLK_COFF(sbi, pblk)) *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); ex--; @@ -2683,11 +2702,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex->ee_len = cpu_to_le16(num); /* - * Do not mark uninitialized if all the blocks in the + * Do not mark unwritten if all the blocks in the * extent have been removed. */ - if (uninitialized && num) - ext4_ext_mark_uninitialized(ex); + if (unwritten && num) + ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf @@ -2725,10 +2744,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, err = ext4_ext_correct_indexes(handle, inode, path); /* - * Free the partial cluster only if the current extent does not - * reference it. Otherwise we might free used cluster. + * If there's a partial cluster and at least one extent remains in + * the leaf, free the partial cluster if it isn't shared with the + * current extent. If there's a partial cluster and no extents + * remain in the leaf, it can't be freed here. It can only be + * freed when it's possible to determine if it's not shared with + * any other extent - when the next leaf is processed or when space + * removal is complete. */ - if (*partial_cluster > 0 && + if (*partial_cluster > 0 && eh->eh_entries && (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != *partial_cluster)) { int flags = get_default_free_blocks_flags(inode); @@ -2831,9 +2855,9 @@ again: end < ee_block + ext4_ext_get_actual_len(ex) - 1) { int split_flag = 0; - if (ext4_ext_is_uninitialized(ex)) - split_flag = EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; + if (ext4_ext_is_unwritten(ex)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; /* * Split the extent in two so that 'end' is the last @@ -3090,7 +3114,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * @path: the path to the extent * @split: the logical block where the extent is splitted. * @split_flags: indicates if the extent could be zeroout if split fails, and - * the states(init or uninit) of new extents. + * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * @@ -3132,10 +3156,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); - BUG_ON(!ext4_ext_is_uninitialized(ex) && + BUG_ON(!ext4_ext_is_unwritten(ex) && split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2)); + EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3147,8 +3171,8 @@ static int ext4_split_extent_at(handle_t *handle, * then we just change the state of the extent, and splitting * is not needed. */ - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); @@ -3162,8 +3186,8 @@ static int ext4_split_extent_at(handle_t *handle, /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); - if (split_flag & EXT4_EXT_MARK_UNINIT1) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT1) + ext4_ext_mark_unwritten(ex); /* * path may lead to new leaf, not to original leaf any more @@ -3177,8 +3201,8 @@ static int ext4_split_extent_at(handle_t *handle, ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex2); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { @@ -3255,7 +3279,7 @@ static int ext4_split_extent(handle_t *handle, struct ext4_extent *ex; unsigned int ee_len, depth; int err = 0; - int uninitialized; + int unwritten; int split_flag1, flags1; int allocated = map->m_len; @@ -3263,14 +3287,14 @@ static int ext4_split_extent(handle_t *handle, ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - uninitialized = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; + if (unwritten) + split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; err = ext4_split_extent_at(handle, inode, path, @@ -3290,15 +3314,20 @@ static int ext4_split_extent(handle_t *handle, return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; - uninitialized = ext4_ext_is_uninitialized(ex); + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } + unwritten = ext4_ext_is_unwritten(ex); split_flag1 = 0; if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; - if (uninitialized) { - split_flag1 |= EXT4_EXT_MARK_UNINIT1; + if (unwritten) { + split_flag1 |= EXT4_EXT_MARK_UNWRIT1; split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNINIT2); + EXT4_EXT_MARK_UNWRIT2); } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); @@ -3313,16 +3342,16 @@ out: /* * This function is called by ext4_ext_map_blocks() if someone tries to write - * to an uninitialized extent. It may result in splitting the uninitialized + * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two - * uninitialized). + * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * Pre-conditions: - * - The extent pointed to by 'path' is uninitialized. + * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * @@ -3368,12 +3397,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ - BUG_ON(!ext4_ext_is_uninitialized(ex)); + BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently - * uninitialized extent to its neighbor. This is much cheaper + * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly * memmove() calls. Transferring to the left is the common case in * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) @@ -3409,7 +3438,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ @@ -3424,7 +3453,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex->ee_block = cpu_to_le32(ee_block + map_len); ext4_ext_store_pblock(ex, ee_pblk + map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(prev_len + map_len); @@ -3455,7 +3484,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((map->m_lblk + map_len) == next_lblk) && /*C2*/ ((ee_pblk + ee_len) == next_pblk) && /*C3*/ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ @@ -3470,7 +3499,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); ext4_ext_store_pblock(abut_ex, next_pblk - map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(next_len + map_len); @@ -3492,7 +3521,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully insde i_size or new_size. + * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; @@ -3575,26 +3604,28 @@ out: /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write - * to an uninitialized extent. + * to an unwritten extent. * - * Writing to an uninitialized extent may result in splitting the uninitialized - * extent into multiple initialized/uninitialized extents (up to three) + * Writing to an unwritten extent may result in splitting the unwritten + * extent into multiple initialized/unwritten extents (up to three) * There are three possibilities: - * a> There is no split required: Entire extent should be uninitialized + * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * + * This works the same way in the case of initialized -> unwritten conversion. + * * One of more index blocks maybe needed if the extent tree grow after - * the uninitialized extent split. To prevent ENOSPC occur at the IO - * complete, we need to split the uninitialized extent before DIO submit - * the IO. The uninitialized extent called at this time will be split - * into three uninitialized extent(at most). After IO complete, the part + * the unwritten extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the unwritten extent before DIO submit + * the IO. The unwritten extent called at this time will be split + * into three unwritten extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * - * Returns the size of uninitialized extent to be written on success. + * Returns the size of unwritten extent to be written on success. */ -static int ext4_split_unwritten_extents(handle_t *handle, +static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, @@ -3606,9 +3637,9 @@ static int ext4_split_unwritten_extents(handle_t *handle, unsigned int ee_len; int split_flag = 0, depth; - ext_debug("ext4_split_unwritten_extents: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", + __func__, inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; @@ -3623,14 +3654,79 @@ static int ext4_split_unwritten_extents(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= EXT4_EXT_MARK_UNINIT2; - if (flags & EXT4_GET_BLOCKS_CONVERT) - split_flag |= EXT4_EXT_DATA_VALID2; + /* Convert to unwritten */ + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + split_flag |= EXT4_EXT_DATA_VALID1; + /* Convert to initialized */ + } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + split_flag |= ee_block + ee_len <= eof_block ? + EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); + } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, path, map, split_flag, flags); } +static int ext4_convert_initialized_extents(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical" + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + goto out; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EIO; + goto out; + } + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + + static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3664,8 +3760,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_unwritten_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT); + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT); if (err < 0) goto out; ext4_ext_drop_refs(path); @@ -3784,7 +3880,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; - lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; return ext4_find_delalloc_range(inode, lblk_start, lblk_end); @@ -3843,9 +3939,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); /* Check towards left side */ - c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start); if (c_offset) { - lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); lblk_to = lblk_from + c_offset - 1; if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) @@ -3853,7 +3949,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } /* Now check towards right. */ - c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); if (allocated_clusters && c_offset) { lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; @@ -3866,7 +3962,39 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } static int -ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, +ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + + /* + * Make sure that the extent is no bigger than we support with + * unwritten extent + */ + if (map->m_len > EXT_UNWRITTEN_MAX_LEN) + map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; + + ret = ext4_convert_initialized_extents(handle, inode, map, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + map->m_flags |= EXT4_MAP_UNWRITTEN; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + + return err ? err : allocated; +} + +static int +ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int allocated, ext4_fsblk_t newblock) @@ -3875,25 +4003,25 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, int err = 0; ext4_io_end_t *io = ext4_inode_aio(inode); - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); /* - * When writing into uninitialized space, we should not fail to + * When writing into unwritten space, we should not fail to * allocate metadata blocks for the new extent block if needed. */ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; - trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, + trace_ext4_ext_handle_unwritten_extents(inode, map, flags, allocated, newblock); /* get_block() before submit the IO, split the extent */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - ret = ext4_split_unwritten_extents(handle, inode, map, - path, flags); + if (flags & EXT4_GET_BLOCKS_PRE_IO) { + ret = ext4_split_convert_extents(handle, inode, map, + path, flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; /* @@ -3906,12 +4034,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); map->m_flags |= EXT4_MAP_UNWRITTEN; - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; goto out; } /* IO end_io complete, convert the filled extent to written */ - if ((flags & EXT4_GET_BLOCKS_CONVERT)) { + if (flags & EXT4_GET_BLOCKS_CONVERT) { ret = ext4_convert_unwritten_extents_endio(handle, inode, map, path); if (ret >= 0) { @@ -3921,6 +4047,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, } else err = ret; map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; @@ -3931,7 +4058,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * repeat fallocate creation request * we already have an unwritten extent */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) { + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; } @@ -4007,10 +4134,6 @@ out1: map->m_pblk = newblock; map->m_len = allocated; out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } return err ? err : allocated; } @@ -4061,7 +4184,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); @@ -4079,8 +4202,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; - map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + - c_offset; + map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* @@ -4143,7 +4265,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth; + int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -4185,8 +4307,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; + /* - * Uninitialized extents are treated as holes, except that + * unwritten extents are treated as holes, except that * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); @@ -4201,13 +4324,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if (!ext4_ext_is_uninitialized(ex)) + /* + * If the extent is initialized check whether the + * caller wants to convert it to unwritten. + */ + if ((!ext4_ext_is_unwritten(ex)) && + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { + allocated = ext4_ext_convert_initialized_extent( + handle, inode, map, path, flags, + allocated, newblock); + goto out2; + } else if (!ext4_ext_is_unwritten(ex)) goto out; - allocated = ext4_ext_handle_uninitialized_extents( + ret = ext4_ext_handle_unwritten_extents( handle, inode, map, path, flags, allocated, newblock); - goto out3; + if (ret < 0) + err = ret; + else + allocated = ret; + goto out2; } } @@ -4234,7 +4371,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); - cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned @@ -4272,15 +4409,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is - * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is - * EXT_UNINIT_MAX_LEN. + * EXT_INIT_MAX_LEN and for an unwritten extent this limit is + * EXT_UNWRITTEN_MAX_LEN. */ if (map->m_len > EXT_INIT_MAX_LEN && - !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_INIT_MAX_LEN; - else if (map->m_len > EXT_UNINIT_MAX_LEN && - (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - map->m_len = EXT_UNINIT_MAX_LEN; + else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_UNWRITTEN_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ newex.ee_len = cpu_to_le16(map->m_len); @@ -4302,7 +4439,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * needed so that future calls to get_implied_cluster_alloc() * work correctly. */ - offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; @@ -4328,21 +4465,19 @@ got_allocated_blocks: /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); - /* Mark uninitialized */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ - ext4_ext_mark_uninitialized(&newex); + /* Mark unwritten */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ + ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; /* * io_end structure was created for every IO write to an - * uninitialized extent. To avoid unnecessary conversion, + * unwritten extent. To avoid unnecessary conversion, * here we flag the IO that really needs the conversion. * For non asycn direct IO case, flag the inode state * that we need to perform conversion when IO is done. */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) + if (flags & EXT4_GET_BLOCKS_PRE_IO) set_unwritten = 1; - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; } err = 0; @@ -4469,9 +4604,9 @@ got_allocated_blocks: /* * Cache the extent and update transaction to commit on fdatasync only - * when it is _not_ an uninitialized extent. + * when it is _not_ an unwritten extent. */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) + if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); @@ -4488,7 +4623,6 @@ out2: kfree(path); } -out3: trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); ext4_es_lru_add(inode); @@ -4529,34 +4663,210 @@ retry: ext4_std_error(inode->i_sb, err); } -static void ext4_falloc_update_inode(struct inode *inode, - int mode, loff_t new_size, int update_ctime) +static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, + ext4_lblk_t len, int flags, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct ext4_map_blocks map; + unsigned int credits; + + map.m_lblk = offset; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNWRITTEN_MAX_LEN) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, len); + +retry: + while (ret >= 0 && ret < len) { + map.m_lblk = map.m_lblk + ret; + map.m_len = len = len - ret; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret <= 0) { + ext4_debug("inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + + return ret > 0 ? ret2 : ret; +} + +static long ext4_zero_range(struct file *file, loff_t offset, + loff_t len, int mode) { - struct timespec now; + struct inode *inode = file_inode(file); + handle_t *handle = NULL; + unsigned int max_blocks; + loff_t new_size = 0; + int ret = 0; + int flags; + int partial; + loff_t start, end; + ext4_lblk_t lblk; + struct address_space *mapping = inode->i_mapping; + unsigned int blkbits = inode->i_blkbits; + + trace_ext4_zero_range(inode, offset, len, mode); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; - if (update_ctime) { - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; } + /* - * Update only when preallocation was requested beyond - * the file size. + * Write out all dirty pages to avoid race conditions + * Then release them. */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) { + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + len - 1); + if (ret) + return ret; + } + + /* + * Round up offset. This is not fallocate, we neet to zero out + * blocks, so convert interior block aligned part of the range to + * unwritten and possibly manually zero out unaligned parts of the + * range. + */ + start = round_up(offset, 1 << blkbits); + end = round_down((offset + len), 1 << blkbits); + + if (start < offset || end > offset + len) + return -EINVAL; + partial = (offset + len) & ((1 << blkbits) - 1); + + lblk = start >> blkbits; + max_blocks = (end >> blkbits); + if (max_blocks < lblk) + max_blocks = 0; + else + max_blocks -= lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + mutex_lock(&inode->i_mutex); + + /* + * Indirect files do not support unwritten extnets + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out_mutex; + /* + * If we have a partial block after EOF we have to allocate + * the entire block. + */ + if (partial) + max_blocks += 1; + } + + if (max_blocks > 0) { + + /* Now release the pages and zero block aligned part of pages*/ + truncate_pagecache_range(inode, start, end - 1); + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Remove entire range from the extent status tree. + */ + ret = ext4_es_remove_extent(inode, lblk, max_blocks); + if (ret) + goto out_dio; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, + mode); + if (ret) + goto out_dio; + } + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(inode->i_sb, ret); + goto out_dio; + } + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + + if (new_size) { if (new_size > i_size_read(inode)) i_size_write(inode, new_size); if (new_size > EXT4_I(inode)->i_disksize) ext4_update_i_disksize(inode, new_size); } else { /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. - */ - if (new_size > i_size_read(inode)) + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } + ext4_mark_inode_dirty(handle, inode); + + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } /* @@ -4570,17 +4880,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); handle_t *handle; - loff_t new_size; + loff_t new_size = 0; unsigned int max_blocks; int ret = 0; - int ret2 = 0; - int retries = 0; int flags; - struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; + ext4_lblk_t lblk; + struct timespec tv; + unsigned int blkbits = inode->i_blkbits; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -4597,83 +4907,69 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + + if (mode & FALLOC_FL_ZERO_RANGE) + return ext4_zero_range(file, offset, len, mode); + trace_ext4_fallocate_enter(inode, offset, len, mode); - map.m_lblk = offset >> blkbits; + lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk; - /* - * credits to insert 1 extent into extent tree - */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) { - mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); - return ret; - } - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; + - lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - /* - * Don't normalize the request if it can fit in one extent so - * that it doesn't get unnecessarily split into multiple - * extents. - */ - if (len <= EXT_UNINIT_MAX_LEN << blkbits) - flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; -retry: - while (ret >= 0 && ret < max_blocks) { - map.m_lblk = map.m_lblk + ret; - map.m_len = max_blocks = max_blocks - ret; - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; - } - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret <= 0) { -#ifdef EXT4FS_DEBUG - ext4_warning(inode->i_sb, - "inode #%lu: block %u: len %u: " - "ext4_ext_map_blocks returned %d", - inode->i_ino, map.m_lblk, - map.m_len, ret); -#endif - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - break; - } - if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, - blkbits) >> blkbits)) - new_size = offset + len; - else - new_size = ((loff_t) map.m_lblk + ret) << blkbits; + mutex_lock(&inode->i_mutex); - ext4_falloc_update_inode(inode, mode, new_size, - (map.m_flags & EXT4_MAP_NEW)); - ext4_mark_inode_dirty(handle, inode); - if ((file->f_flags & O_SYNC) && ret >= max_blocks) - ext4_handle_sync(handle); - ret2 = ext4_journal_stop(handle); - if (ret2) - break; + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) { - ret = 0; - goto retry; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); + if (ret) + goto out; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + goto out; + + tv = inode->i_ctime = ext4_current_time(inode); + + if (new_size) { + if (new_size > i_size_read(inode)) { + i_size_write(inode, new_size); + inode->i_mtime = tv; + } + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } + ext4_mark_inode_dirty(handle, inode); + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out: mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, - ret > 0 ? ret2 : ret); - return ret > 0 ? ret2 : ret; + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; } /* @@ -4884,3 +5180,333 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ext4_es_lru_add(inode); return error; } + +/* + * ext4_access_path: + * Function to access the path buffer for marking it dirty. + * It also checks if there are sufficient credits left in the journal handle + * to update path. + */ +static int +ext4_access_path(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int credits, err; + + if (!ext4_handle_valid(handle)) + return 0; + + /* + * Check if need to extend journal credits + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups + */ + if (handle->h_buffer_credits < 7) { + credits = ext4_writepage_trans_blocks(inode); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + /* EAGAIN is success */ + if (err && err != -EAGAIN) + return err; + } + + err = ext4_ext_get_access(handle, inode, path); + return err; +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift + * from starting block for each extent. + */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, + struct inode *inode, handle_t *handle, + ext4_lblk_t *start) +{ + int depth, err = 0; + struct ext4_extent *ex_start, *ex_last; + bool update = 0; + depth = path->p_depth; + + while (depth >= 0) { + if (depth == path->p_depth) { + ex_start = path[depth].p_ext; + if (!ex_start) + return -EIO; + + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + if (!ex_last) + return -EIO; + + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) + update = 1; + + *start = le32_to_cpu(ex_last->ee_block) + + ext4_ext_get_actual_len(ex_last); + + while (ex_start <= ex_last) { + le32_add_cpu(&ex_start->ee_block, -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; + } + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + if (--depth < 0 || !update) + break; + } + + /* Update index too */ + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + /* we are done if current index is not a starting index */ + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) + break; + + depth--; + } + +out: + return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lies in the range from start to the last allocated + * block for the file are shifted downwards by shift blocks. + * On success, 0 is returned, error otherwise. + */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, + ext4_lblk_t start, ext4_lblk_t shift) +{ + struct ext4_ext_path *path; + int ret = 0, depth; + struct ext4_extent *extent; + ext4_lblk_t stop_block, current_block; + ext4_lblk_t ex_start, ex_end; + + /* Let path point to the last extent */ + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + ext4_ext_drop_refs(path); + kfree(path); + return ret; + } + + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + ext4_ext_drop_refs(path); + kfree(path); + + /* Nothing to shift, if hole is at the end of file */ + if (start >= stop_block) + return ret; + + /* + * Don't start shifting extents until we make sure the hole is big + * enough to accomodate the shift. + */ + path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } + ext4_ext_drop_refs(path); + kfree(path); + + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) + return -EINVAL; + + /* Its safe to start updating extents */ + while (start < stop_block) { + path = ext4_ext_find_extent(inode, start, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } + + current_block = le32_to_cpu(extent->ee_block); + if (start > current_block) { + /* Hole, move to the next extent */ + ret = mext_next_extent(inode, path, &extent); + if (ret != 0) { + ext4_ext_drop_refs(path); + kfree(path); + if (ret == 1) + ret = 0; + break; + } + } + ret = ext4_ext_shift_path_extents(path, shift, inode, + handle, &start); + ext4_ext_drop_refs(path); + kfree(path); + if (ret) + break; + } + + return ret; +} + +/* + * ext4_collapse_range: + * This implements the fallocate's collapse range functionality for ext4 + * Returns: 0 and non-zero on error. + */ +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t punch_start, punch_stop; + handle_t *handle; + unsigned int credits; + loff_t new_size, ioffset; + int ret; + + /* Collapse range works only on fs block size aligned offsets. */ + if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || + len & (EXT4_BLOCK_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) + return -EOPNOTSUPP; + + trace_ext4_collapse_range(inode, offset, len); + + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + truncate_pagecache(inode, ioffset); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, punch_start, + EXT_MAX_BLOCKS - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + ext4_discard_preallocations(inode); + + ret = ext4_ext_shift_extents(inode, handle, punch_stop, + punch_stop - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + new_size = i_size_read(inode) - len; + i_size_write(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 3981ff78395..0b7e28e7eaa 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode) while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); - printk(KERN_DEBUG " [%u/%u) %llu %llx", + printk(KERN_DEBUG " [%u/%u) %llu %x", es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); node = rb_next(node); @@ -344,8 +344,14 @@ static int ext4_es_can_be_merged(struct extent_status *es1, if (ext4_es_status(es1) != ext4_es_status(es2)) return 0; - if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL) + if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { + pr_warn("ES assertion failed when merging extents. " + "The sum of lengths of es1 (%d) and es2 (%d) " + "is bigger than allowed file size (%d)\n", + es1->es_len, es2->es_len, EXT_MAX_BLOCKS); + WARN_ON(1); return 0; + } if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) return 0; @@ -433,7 +439,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, ee_start = ext4_ext_pblock(ex); ee_len = ext4_ext_get_actual_len(ex); - ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; + ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0; es_status = ext4_es_is_unwritten(es) ? 1 : 0; /* @@ -445,8 +451,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, pr_warn("ES insert assertion failed for " "inode: %lu we can find an extent " "at block [%d/%d/%llu/%c], but we " - "want to add an delayed/hole extent " - "[%d/%d/%llu/%llx]\n", + "want to add a delayed/hole extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, ee_block, ee_len, ee_start, ee_status ? 'u' : 'w', es->es_lblk, es->es_len, @@ -486,8 +492,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { pr_warn("ES insert assertion failed for inode: %lu " "can't find an extent at block %d but we want " - "to add an written/unwritten extent " - "[%d/%d/%llu/%llx]\n", inode->i_ino, + "to add a written/unwritten extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); } @@ -524,7 +530,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, */ pr_warn("ES insert assertion failed for inode: %lu " "We can find blocks but we want to add a " - "delayed/hole extent [%d/%d/%llu/%llx]\n", + "delayed/hole extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; @@ -554,7 +560,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, if (ext4_es_is_written(es)) { pr_warn("ES insert assertion failed for inode: %lu " "We can't find the block but we want to add " - "an written extent [%d/%d/%llu/%llx]\n", + "a written extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; @@ -658,8 +664,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; - ext4_es_store_pblock(&newes, pblk); - ext4_es_store_status(&newes, status); + ext4_es_store_pblock_status(&newes, pblk, status); trace_ext4_es_insert_extent(inode, &newes); ext4_es_insert_extent_check(inode, &newes); @@ -699,8 +704,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; - ext4_es_store_pblock(&newes, pblk); - ext4_es_store_status(&newes, status); + ext4_es_store_pblock_status(&newes, pblk, status); trace_ext4_es_cache_extent(inode, &newes); if (!len) @@ -812,13 +816,13 @@ retry: newes.es_lblk = end + 1; newes.es_len = len2; + block = 0x7FDEADBEEFULL; if (ext4_es_is_written(&orig_es) || - ext4_es_is_unwritten(&orig_es)) { + ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + orig_es.es_len - len2; - ext4_es_store_pblock(&newes, block); - } - ext4_es_store_status(&newes, ext4_es_status(&orig_es)); + ext4_es_store_pblock_status(&newes, block, + ext4_es_status(&orig_es)); err = __es_insert_extent(inode, &newes); if (err) { es->es_lblk = orig_es.es_lblk; @@ -962,10 +966,10 @@ retry: continue; } - if (ei->i_es_lru_nr == 0 || ei == locked_ei) + if (ei->i_es_lru_nr == 0 || ei == locked_ei || + !write_trylock(&ei->i_es_lock)) continue; - write_lock(&ei->i_es_lock); shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); if (ei->i_es_lru_nr == 0) list_del_init(&ei->i_es_lru); diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 167f4ab8ecc..f1b62a41992 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es, (es->es_pblk & ~ES_MASK)); } +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (pb & ~ES_MASK)); +} + extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_lru_add(struct inode *inode); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3da21945ff1..8695f70af1e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -57,7 +57,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -void ext4_unwritten_wait(struct inode *inode) +static void ext4_unwritten_wait(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); @@ -74,132 +74,126 @@ void ext4_unwritten_wait(struct inode *inode) * or one thread will zero the other's data, causing corruption. */ static int -ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; int blockmask = sb->s_blocksize - 1; - size_t count = iov_length(iov, nr_segs); - loff_t final_size = pos + count; - if (pos >= inode->i_size) + if (pos >= i_size_read(inode)) return 0; - if ((pos & blockmask) || (final_size & blockmask)) + if ((pos | iov_iter_alignment(from)) & blockmask) return 1; return 0; } static ssize_t -ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct inode *inode = file_inode(iocb->ki_filp); + struct mutex *aio_mutex = NULL; struct blk_plug plug; - int unaligned_aio = 0; - ssize_t ret; + int o_direct = file->f_flags & O_DIRECT; int overwrite = 0; - size_t length = iov_length(iov, nr_segs); - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb)) - unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); + size_t length = iov_iter_count(from); + ssize_t ret; + loff_t pos = iocb->ki_pos; - /* Unaligned direct AIO must be serialized; see comment above */ - if (unaligned_aio) { - mutex_lock(ext4_aio_mutex(inode)); + /* + * Unaligned direct AIO must be serialized; see comment above + * In the case of O_APPEND, assume that we must always serialize + */ + if (o_direct && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + !is_sync_kiocb(iocb) && + (file->f_flags & O_APPEND || + ext4_unaligned_aio(inode, from, pos))) { + aio_mutex = ext4_aio_mutex(inode); + mutex_lock(aio_mutex); ext4_unwritten_wait(inode); } - BUG_ON(iocb->ki_pos != pos); - mutex_lock(&inode->i_mutex); - blk_start_plug(&plug); - - iocb->private = &overwrite; + if (file->f_flags & O_APPEND) + iocb->ki_pos = pos = i_size_read(inode); - /* check whether we do a DIO overwrite or not */ - if (ext4_should_dioread_nolock(inode) && !unaligned_aio && - !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { - struct ext4_map_blocks map; - unsigned int blkbits = inode->i_blkbits; - int err, len; + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - map.m_lblk = pos >> blkbits; - map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) - - map.m_lblk; - len = map.m_len; + if ((pos > sbi->s_bitmap_maxbytes) || + (pos == sbi->s_bitmap_maxbytes && length > 0)) { + mutex_unlock(&inode->i_mutex); + ret = -EFBIG; + goto errout; + } - err = ext4_map_blocks(NULL, inode, &map, 0); - /* - * 'err==len' means that all of blocks has been preallocated no - * matter they are initialized or not. For excluding - * uninitialized extents, we need to check m_flags. There are - * two conditions that indicate for initialized extents. - * 1) If we hit extent cache, EXT4_MAP_MAPPED flag is returned; - * 2) If we do a real lookup, non-flags are returned. - * So we should check these two conditions. - */ - if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) - overwrite = 1; + if (pos + length > sbi->s_bitmap_maxbytes) + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos); } - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); - mutex_unlock(&inode->i_mutex); - - if (ret > 0) { - ssize_t err; - - err = generic_write_sync(file, pos, ret); - if (err < 0 && ret > 0) - ret = err; - } - blk_finish_plug(&plug); + if (o_direct) { + blk_start_plug(&plug); - if (unaligned_aio) - mutex_unlock(ext4_aio_mutex(inode)); + iocb->private = &overwrite; - return ret; -} + /* check whether we do a DIO overwrite or not */ + if (ext4_should_dioread_nolock(inode) && !aio_mutex && + !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, len; -static ssize_t -ext4_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct inode *inode = file_inode(iocb->ki_filp); - ssize_t ret; + map.m_lblk = pos >> blkbits; + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) + - map.m_lblk; + len = map.m_len; - /* - * If we have encountered a bitmap-format file, the size limit - * is smaller than s_maxbytes, which is for extent-mapped files. - */ + err = ext4_map_blocks(NULL, inode, &map, 0); + /* + * 'err==len' means that all of blocks has + * been preallocated no matter they are + * initialized or not. For excluding + * unwritten extents, we need to check + * m_flags. There are two conditions that + * indicate for initialized extents. 1) If we + * hit extent cache, EXT4_MAP_MAPPED flag is + * returned; 2) If we do a real lookup, + * non-flags are returned. So we should check + * these two conditions. + */ + if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) + overwrite = 1; + } + } - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - size_t length = iov_length(iov, nr_segs); + ret = __generic_file_write_iter(iocb, from); + mutex_unlock(&inode->i_mutex); - if ((pos > sbi->s_bitmap_maxbytes || - (pos == sbi->s_bitmap_maxbytes && length > 0))) - return -EFBIG; + if (ret > 0) { + ssize_t err; - if (pos + length > sbi->s_bitmap_maxbytes) { - nr_segs = iov_shorten((struct iovec *)iov, nr_segs, - sbi->s_bitmap_maxbytes - pos); - } + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; } + if (o_direct) + blk_finish_plug(&plug); - if (unlikely(iocb->ki_filp->f_flags & O_DIRECT)) - ret = ext4_file_dio_write(iocb, iov, nr_segs, pos); - else - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - +errout: + if (aio_mutex) + mutex_unlock(aio_mutex); return ret; } static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -243,6 +237,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) { ext4_journal_stop(handle); @@ -592,10 +587,10 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = ext4_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, @@ -605,7 +600,7 @@ const struct file_operations ext4_file_operations = { .release = ext4_release_file, .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, }; @@ -617,6 +612,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 137193ff389..5b87fc36aab 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -71,6 +71,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, struct ext4_group_desc *gdp) { struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent @@ -78,7 +79,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } @@ -116,6 +126,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -185,6 +196,12 @@ verify: ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, bitmap_blk); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return NULL; } @@ -321,6 +338,12 @@ out: fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); } @@ -432,7 +455,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4fs_dirhash(qstr->name, qstr->len, &hinfo); grp = hinfo.hash; } else - get_random_bytes(&grp, sizeof(grp)); + grp = prandom_u32(); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; @@ -851,6 +874,13 @@ got: goto out; } + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -887,13 +917,6 @@ got: } } - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) { - ext4_std_error(sb, err); - goto out; - } - /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 594009f5f52..fd69da19482 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -389,7 +389,13 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return 0; failed: for (; i >= 0; i--) { - if (i != indirect_blks && branch[i].bh) + /* + * We want to ext4_forget() only freshly allocated indirect + * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and + * buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called. + */ + if (i > 0 && i != indirect_blks && branch[i].bh) ext4_forget(handle, 1, inode, branch[i].bh, branch[i].bh->b_blocknr); ext4_free_blocks(handle, inode, NULL, new_blocks[i], @@ -639,8 +645,7 @@ out: * VFS code falls back into buffered path in that case so we are safe. */ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -648,7 +653,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, handle_t *handle; ssize_t ret; int orphan = 0; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); int retries = 0; if (rw == WRITE) { @@ -687,18 +692,17 @@ retry: goto locked; } ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, + inode->i_sb->s_bdev, iter, offset, ext4_get_block, NULL, NULL, 0); inode_dio_done(inode); } else { locked: - ret = blockdev_direct_IO(rw, iocb, inode, iov, - offset, nr_segs, ext4_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, + offset, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) ext4_truncate_failed_write(inode); @@ -1312,16 +1316,24 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, blk = *i_data; if (level > 0) { ext4_lblk_t first2; + ext4_lblk_t count2; + bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); if (!bh) { EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), "Read failure"); return -EIO; } - first2 = (first > offset) ? first - offset : 0; + if (first > offset) { + first2 = first - offset; + count2 = count; + } else { + first2 = 0; + count2 = count - (offset - first); + } ret = free_hole_blocks(handle, inode, bh, (__le32 *)bh->b_data, level - 1, - first2, count - offset, + first2, count2, inode->i_sb->s_blocksize >> 2); if (ret) { brelse(bh); @@ -1331,8 +1343,8 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, if (level == 0 || (bh && all_zeroes((__le32 *)bh->b_data, (__le32 *)bh->b_data + addr_per_block))) { - ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); - *i_data = 0; + ext4_free_data(handle, inode, parent_bh, + i_data, i_data + 1); } brelse(bh); bh = NULL; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d9ecbf1113a..645205d8ada 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -22,7 +22,7 @@ #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 -int ext4_get_inline_size(struct inode *inode) +static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) return EXT4_I(inode)->i_inline_size; @@ -211,8 +211,8 @@ out: * value since it is already handled by ext4_xattr_ibody_inline_set. * That saves us one memcpy. */ -void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, - void *buffer, loff_t pos, unsigned int len) +static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, + void *buffer, loff_t pos, unsigned int len) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; @@ -264,6 +264,7 @@ static int ext4_create_inline_data(handle_t *handle, if (error) return error; + BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, is.iloc.bh); if (error) goto out; @@ -347,6 +348,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, if (error == -ENODATA) goto out; + BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, is.iloc.bh); if (error) goto out; @@ -373,8 +375,8 @@ out: return error; } -int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) +static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) { int ret, size; struct ext4_inode_info *ei = EXT4_I(inode); @@ -424,6 +426,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, if (error) goto out; + BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, is.iloc.bh); if (error) goto out; @@ -849,15 +852,16 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, handle_t *handle; struct page *page; struct ext4_iloc iloc; + int retries; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; +retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - handle = NULL; goto out; } @@ -867,7 +871,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, if (inline_size >= pos + len) { ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) - goto out; + goto out_journal; } if (ret == -ENOSPC) { @@ -875,6 +879,10 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, inode, flags, fsdata); + ext4_journal_stop(handle); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; goto out; } @@ -887,7 +895,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; - goto out; + goto out_journal; } down_read(&EXT4_I(inode)->xattr_sem); @@ -904,16 +912,15 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, up_read(&EXT4_I(inode)->xattr_sem); *pagep = page; - handle = NULL; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); page_cache_release(page); +out_journal: + ext4_journal_stop(handle); out: - if (handle) - ext4_journal_stop(handle); brelse(iloc.bh); return ret; } @@ -994,17 +1001,16 @@ static int ext4_add_dirent_to_inline(handle_t *handle, struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned short reclen; int err; struct ext4_dir_entry_2 *de; - reclen = EXT4_DIR_REC_LEN(namelen); err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, inline_size, name, namelen, &de); if (err) return err; + BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; @@ -1442,6 +1448,7 @@ int ext4_read_inline_dir(struct file *file, if (ret < 0) goto out; + ret = 0; sb = inode->i_sb; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); offset = ctx->pos; @@ -1666,6 +1673,7 @@ int ext4_delete_inline_entry(handle_t *handle, EXT4_MIN_INLINE_DATA_SIZE; } + BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) goto out; @@ -1838,7 +1846,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle, { int error; struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; @@ -1847,7 +1854,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle, return error; raw_inode = ext4_raw_inode(&iloc); - header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); if (EXT4_XATTR_LEN(entry->e_name_len) + @@ -1925,9 +1931,11 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) } /* Clear the content within i_blocks. */ - if (i_size < EXT4_MIN_INLINE_DATA_SIZE) - memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, - EXT4_MIN_INLINE_DATA_SIZE - i_size); + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { + void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; + memset(p + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + } EXT4_I(inode)->i_inline_size = i_size < EXT4_MIN_INLINE_DATA_SIZE ? diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0d424d7ac02..8a064734e6e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -38,6 +38,7 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/aio.h> +#include <linux/bitops.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -144,8 +145,11 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ static int ext4_inode_is_fast_symlink(struct inode *inode) { - int ea_blocks = EXT4_I(inode)->i_file_acl ? - (inode->i_sb->s_blocksize >> 9) : 0; + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + + if (ext4_has_inline_data(inode)) + return 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } @@ -214,7 +218,7 @@ void ext4_evict_inode(struct inode *inode) jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; @@ -225,7 +229,7 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); if (is_bad_inode(inode)) @@ -442,7 +446,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * could be converted. */ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -488,8 +492,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocate. - * if create==0 and the blocks are pre-allocated and uninitialized block, + * On success, it returns the number of blocks being mapped or allocated. + * if create==0 and the blocks are pre-allocated and unwritten block, * the result buffer head is unmapped. If the create ==1, it will make sure * the buffer head is mapped. * @@ -503,6 +507,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; + int ret = 0; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -514,6 +519,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + /* + * ext4_map_blocks returns an int, and m_len is an unsigned int + */ + if (unlikely(map->m_len > INT_MAX)) + map->m_len = INT_MAX; + + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); @@ -543,7 +558,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -552,7 +567,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, EXT4_GET_BLOCKS_KEEP_SIZE); } if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -579,7 +593,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -596,7 +610,13 @@ found: * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) - return retval; + /* + * If we need to convert extent to unwritten + * we continue and do the actual work in + * ext4_ext_map_blocks() + */ + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) + return retval; /* * Here we clear m_flags because after allocating an new extent, @@ -605,12 +625,12 @@ found: map->m_flags &= ~EXT4_MAP_FLAGS; /* - * New blocks allocate and/or writing to uninitialized extent + * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_blocks() * with create == 1 flag. */ - down_write((&EXT4_I(inode)->i_data_sem)); + down_write(&EXT4_I(inode)->i_data_sem); /* * if the caller is from delayed allocation writeout path @@ -652,7 +672,6 @@ found: ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -687,7 +706,7 @@ found: has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -906,6 +925,7 @@ int do_journal_get_write_access(handle_t *handle, */ if (dirty) clear_buffer_dirty(bh); + BUFFER_TRACE(bh, "get write access"); ret = ext4_journal_get_write_access(handle, bh); if (!ret && dirty) ret = ext4_handle_dirty_metadata(handle, NULL, bh); @@ -1206,7 +1226,6 @@ static int ext4_journalled_write_end(struct file *file, */ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1218,7 +1237,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1238,10 +1256,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } return -ENOSPC; } ei->i_reserved_meta_blocks += md_needed; @@ -1255,7 +1269,6 @@ repeat: */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1277,7 +1290,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1297,10 +1309,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } @@ -1536,7 +1544,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, ext4_es_lru_add(inode); if (ext4_es_is_hole(&es)) { retval = 0; - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); goto add_delayed; } @@ -1573,7 +1581,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * Try to see if we can get the block without requesting a new * file system block. */ - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) { /* * We will soon create blocks for this page, and let @@ -1765,6 +1773,7 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); if (inline_data) { + BUFFER_TRACE(inode_bh, "get write access"); ret = ext4_journal_get_write_access(handle, inode_bh); err = ext4_handle_dirty_metadata(handle, inode, inode_bh); @@ -1784,7 +1793,7 @@ static int __ext4_journalled_writepage(struct page *page, ret = err; if (!ext4_has_inline_data(inode)) - ext4_walk_page_buffers(handle, page_bufs, 0, len, + ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: @@ -1842,6 +1851,7 @@ static int ext4_writepage(struct page *page, struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; struct ext4_io_submit io_submit; + bool keep_towrite = false; trace_ext4_writepage(page); size = i_size_read(inode); @@ -1872,6 +1882,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return 0; } + keep_towrite = true; } if (PageChecked(page) && ext4_should_journal_data(inode)) @@ -1888,7 +1899,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return -ENOMEM; } - ret = ext4_bio_write_page(&io_submit, page, len, wbc); + ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); ext4_io_submit(&io_submit); /* Drop io_end reference we got from init */ ext4_put_io_end_defer(io_submit.io_end); @@ -1907,7 +1918,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) else len = PAGE_CACHE_SIZE; clear_page_dirty_for_io(page); - err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); if (!err) mpd->wbc->nr_to_write--; mpd->first_page++; @@ -2028,7 +2039,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, - * and mark buffers as uninit when we perform writes to uninitialized extents + * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. @@ -2122,12 +2133,12 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; - int err; + int err, dioread_nolock; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or - * to convert an uninitialized extent to be initialized (in the case + * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in @@ -2144,7 +2155,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL; - if (ext4_should_dioread_nolock(inode)) + dioread_nolock = ext4_should_dioread_nolock(inode); + if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (map->m_flags & (1 << BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; @@ -2152,7 +2164,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; - if (map->m_flags & EXT4_MAP_UNINIT) { + if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; @@ -2178,6 +2190,9 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * * @handle - handle for journal operations * @mpd - extent to map + * @give_up_on_write - we set this to true iff there is a fatal error and there + * is no hope of writing the data. The caller should discard + * dirty pages to avoid infinite loops. * * The function maps extent starting at mpd->lblk of length mpd->len. If it is * delayed, blocks are allocated, if it is unwritten, we may need to convert @@ -2240,13 +2255,23 @@ static int mpage_map_and_submit_extent(handle_t *handle, return err; } while (map->m_len); - /* Update on-disk size after IO is submitted */ + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; if (disksize > EXT4_I(inode)->i_disksize) { int err2; - - ext4_wb_update_i_disksize(inode, disksize); + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); if (err2) ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", @@ -2295,6 +2320,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct address_space *mapping = mpd->inode->i_mapping; struct pagevec pvec; unsigned int nr_pages; + long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; int tag; @@ -2330,6 +2356,17 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (page->index > end) goto out; + /* + * Accumulated enough dirty pages? This doesn't apply + * to WB_SYNC_ALL mode. For integrity sync we have to + * keep going because someone may be concurrently + * dirtying pages, and we might have synced a lot of + * newly appeared dirty pages, but have not synced all + * of the old dirty pages. + */ + if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) + goto out; + /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_page != page->index) goto out; @@ -2364,19 +2401,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (err <= 0) goto out; err = 0; - - /* - * Accumulated enough dirty pages? This doesn't apply - * to WB_SYNC_ALL mode. For integrity sync we have to - * keep going because someone may be concurrently - * dirtying pages, and we might have synced a lot of - * newly appeared dirty pages, but have not synced all - * of the old dirty pages. - */ - if (mpd->wbc->sync_mode == WB_SYNC_NONE && - mpd->next_page - mpd->first_page >= - mpd->wbc->nr_to_write) - goto out; + left--; } pagevec_release(&pvec); cond_resched(); @@ -2420,16 +2445,15 @@ static int ext4_writepages(struct address_space *mapping, * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - return 0; + goto out_writepages; if (ext4_should_journal_data(inode)) { struct blk_plug plug; - int ret; blk_start_plug(&plug); ret = write_cache_pages(mapping, wbc, __writepage, mapping); blk_finish_plug(&plug); - return ret; + goto out_writepages; } /* @@ -2442,8 +2466,10 @@ static int ext4_writepages(struct address_space *mapping, * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) - return -EROFS; + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ret = -EROFS; + goto out_writepages; + } if (ext4_should_dioread_nolock(inode)) { /* @@ -2563,7 +2589,7 @@ retry: break; } blk_finish_plug(&plug); - if (!ret && !cycled) { + if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; mpd.last_page = writeback_index - 1; mpd.first_page = 0; @@ -3052,9 +3078,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. * - * For holes, we fallocate those blocks, mark them as uninitialized + * For holes, we fallocate those blocks, mark them as unwritten * If those blocks were preallocated, we mark sure they are split, but - * still keep the range to write as uninitialized. + * still keep the range to write as unwritten. * * The unwritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still pending when return, we @@ -3067,13 +3093,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * */ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); int overwrite = 0; get_block_t *get_block_func = NULL; int dio_flags = 0; @@ -3082,7 +3107,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, /* Use the old path for reads and writes beyond i_size. */ if (rw != WRITE || final_size > inode->i_size) - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + return ext4_ind_direct_IO(rw, iocb, iter, offset); BUG_ON(iocb->private == NULL); @@ -3106,12 +3131,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, * We could direct write to holes and fallocate. * * Allocated blocks to fill the hole are marked as - * uninitialized to prevent parallel buffered read to expose + * unwritten to prevent parallel buffered read to expose * the stale data before DIO complete the data IO. * * As to previously fallocated extents, ext4 get_block will * just simply mark the buffer mapped but still keep the - * extents uninitialized. + * extents unwritten. * * For non AIO case, we will convert those unwritten extents * to written after return back from blockdev_direct_IO. @@ -3149,8 +3174,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, dio_flags = DIO_LOCKING; } ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, + inode->i_sb->s_bdev, iter, + offset, get_block_func, ext4_end_io_dio, NULL, @@ -3204,11 +3229,11 @@ retake_lock: } static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; /* @@ -3221,13 +3246,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_has_inline_data(inode)) return 0; - trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + trace_ext4_direct_IO_enter(inode, offset, count, rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + ret = ext4_ext_direct_IO(rw, iocb, iter, offset); else - ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); - trace_ext4_direct_IO_exit(inode, offset, - iov_length(iov, nr_segs), rw, ret); + ret = ext4_ind_direct_IO(rw, iocb, iter, offset); + trace_ext4_direct_IO_exit(inode, offset, count, rw, ret); return ret; } @@ -3320,33 +3344,13 @@ void ext4_set_aops(struct inode *inode) } /* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block * that cooresponds to 'from' */ -int ext4_block_zero_page_range(handle_t *handle, +static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -3436,6 +3440,26 @@ unlock: return err; } +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +static int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); +} + int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { @@ -3509,12 +3533,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (EXT4_SB(sb)->s_cluster_ratio > 1) { - /* TODO: Add support for bigalloc file systems */ - return -EOPNOTSUPP; - } - - trace_ext4_punch_hole(inode, offset, length); + trace_ext4_punch_hole(inode, offset, length, 0); /* * Write out all dirty pages to avoid race conditions @@ -3528,15 +3547,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3617,10 +3627,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* Now release the pages again to reduce race window */ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: @@ -3694,7 +3709,7 @@ void ext4_truncate(struct inode *inode) /* * There is a possibility that we're either freeing the inode - * or it completely new indode. In those cases we might not + * or it's a completely new inode. In those cases we might not * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) @@ -3934,18 +3949,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) void ext4_set_inode_flags(struct inode *inode) { unsigned int flags = EXT4_I(inode)->i_flags; + unsigned int new_fl = 0; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); if (flags & EXT4_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4164,11 +4181,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); - inode->i_version = le32_to_cpu(raw_inode->i_disk_version); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - inode->i_version |= - (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } } ret = 0; @@ -4291,12 +4310,15 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; + struct super_block *sb = inode->i_sb; int err = 0, rc, block; - int need_datasync = 0; + int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; - /* For fields not not tracking in the in-memory inode, + spin_lock(&ei->i_raw_lock); + + /* For fields not tracked in the in-memory inode, * initialise them to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); @@ -4334,12 +4356,13 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - if (ext4_inode_blocks_set(handle, raw_inode, ei)) + if (ext4_inode_blocks_set(handle, raw_inode, ei)) { + spin_unlock(&ei->i_raw_lock); goto out_brelse; + } raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); @@ -4348,24 +4371,11 @@ static int ext4_do_update_inode(handle_t *handle, need_datasync = 1; } if (ei->i_disksize > 0x7fffffffULL) { - struct super_block *sb = inode->i_sb; if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || EXT4_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT4_GOOD_OLD_REV)) { - /* If this is the first large file - * created, add a flag to the superblock. - */ - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - goto out_brelse; - ext4_update_dynamic_rev(sb); - EXT4_SET_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE); - ext4_handle_sync(handle); - err = ext4_handle_dirty_super(handle, sb); - } + cpu_to_le32(EXT4_GOOD_OLD_REV)) + set_large_file = 1; } raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { @@ -4384,22 +4394,37 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_block[block] = ei->i_data[block]; } - raw_inode->i_disk_version = cpu_to_le32(inode->i_version); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(inode->i_version >> 32); - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } } ext4_inode_csum_set(inode, raw_inode, ei); + spin_unlock(&ei->i_raw_lock); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) err = rc; ext4_clear_inode_state(inode, EXT4_STATE_NEW); - + if (set_large_file) { + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext4_update_dynamic_rev(sb); + EXT4_SET_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + ext4_handle_sync(handle); + err = ext4_handle_dirty_super(handle, sb); + } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_brelse: brelse(bh); @@ -4412,21 +4437,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -4438,15 +4462,15 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (EXT4_SB(inode->i_sb)->s_journal) { @@ -4456,7 +4480,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext4_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_force_commit(inode->i_sb); @@ -4466,7 +4495,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) err = __ext4_get_inode_loc(inode, &iloc, 0); if (err) return err; - if (wbc->sync_mode == WB_SYNC_ALL) + /* + * sync(2) will flush the whole buffer cache. No need to do + * it here separately for each inode. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, @@ -4594,6 +4627,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } + + if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) + inode_inc_iversion(inode); + if (S_ISREG(inode->i_mode) && (attr->ia_size < inode->i_size)) { if (ext4_should_order_data(inode)) { @@ -4671,7 +4708,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_orphan_del(NULL, inode); if (!rc && (ia_valid & ATTR_MODE)) - rc = ext4_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext4_std_error(inode->i_sb, error); @@ -4690,6 +4727,15 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, generic_fillattr(inode, stat); /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * others doen't incorrectly think the file is completely sparse. + */ + if (unlikely(ext4_has_inline_data(inode))) + stat->blocks += (stat->size + 511) >> 9; + + /* * We can't update i_blocks if the block allocation is delayed * otherwise in the case of system crash before the real block * allocation is done, we will have i_blocks inconsistent with @@ -4700,9 +4746,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, * blocks for this file. */ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), - EXT4_I(inode)->i_reserved_data_blocks); - - stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9); + EXT4_I(inode)->i_reserved_data_blocks); + stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); return 0; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a569d335f80..0f2252ec274 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -101,28 +101,18 @@ static long swap_inode_boot_loader(struct super_block *sb, handle_t *handle; int err; struct inode *inode_bl; - struct ext4_inode_info *ei; struct ext4_inode_info *ei_bl; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = EXT4_SB(sb); - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { - err = -EINVAL; - goto swap_boot_out; - } - - if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto swap_boot_out; - } + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + return -EINVAL; - sbi = EXT4_SB(sb); - ei = EXT4_I(inode); + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + return -EPERM; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); - if (IS_ERR(inode_bl)) { - err = PTR_ERR(inode_bl); - goto swap_boot_out; - } + if (IS_ERR(inode_bl)) + return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); filemap_flush(inode->i_mapping); @@ -130,7 +120,7 @@ static long swap_inode_boot_loader(struct super_block *sb, /* Protect orig inodes against a truncate and make sure, * that only 1 swap_inode_boot_loader is running. */ - ext4_inode_double_lock(inode, inode_bl); + lock_two_nondirectories(inode, inode_bl); truncate_inode_pages(&inode->i_data, 0); truncate_inode_pages(&inode_bl->i_data, 0); @@ -144,7 +134,7 @@ static long swap_inode_boot_loader(struct super_block *sb, handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; - goto swap_boot_out; + goto journal_err_out; } /* Protect extent tree against block allocations via delalloc */ @@ -197,19 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_mark_inode_dirty(handle, inode); } } - ext4_journal_stop(handle); - ext4_double_up_write_data_sem(inode, inode_bl); +journal_err_out: ext4_inode_resume_unlocked_dio(inode); ext4_inode_resume_unlocked_dio(inode_bl); - - ext4_inode_double_unlock(inode, inode_bl); - + unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); - -swap_boot_out: return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a41e3ba8cfa..2dcb936be90 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -722,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; @@ -751,14 +752,17 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u clusters in bitmap, %u in gd; " - "block bitmap corrupt.", + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } mb_set_largest_free_order(sb, grp); @@ -989,7 +993,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); @@ -1003,7 +1007,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, pnum = block / blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_buddy_page = page; return 0; @@ -1044,6 +1048,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy page to page cache. + * The call to ext4_mb_get_buddy_page_lock will mark the + * page accessed. */ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { @@ -1062,7 +1068,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) ret = -EIO; goto err; } - mark_page_accessed(page); if (e4b.bd_buddy_page == NULL) { /* @@ -1082,7 +1087,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) ret = -EIO; goto err; } - mark_page_accessed(page); err: ext4_mb_put_buddy_page_lock(&e4b); return ret; @@ -1141,7 +1145,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, /* we could use find_or_create_page(), but it locks page * what we'd like to avoid in fast path ... */ - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) /* @@ -1168,19 +1172,24 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } + + /* Pages marked accessed already */ e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) page_cache_release(page); @@ -1197,13 +1206,18 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } + + /* Pages marked accessed already */ e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); @@ -1421,6 +1435,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); @@ -1431,6 +1446,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + e4b->bd_info->bb_free); /* Mark the block group as corrupt. */ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &e4b->bd_info->bb_state); @@ -1808,6 +1826,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); + ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; @@ -1936,7 +1955,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, */ break; } - + ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; @@ -1977,6 +1996,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; + ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; @@ -2607,7 +2627,7 @@ int ext4_mb_init(struct super_block *sb) sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; - goto out_free_groupinfo_slab; + goto out; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -2632,8 +2652,6 @@ int ext4_mb_init(struct super_block *sb) out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; -out_free_groupinfo_slab: - ext4_groupinfo_destroy_slabs(); out: kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; @@ -2866,6 +2884,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!bitmap_bh) goto out_err; + BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, bitmap_bh); if (err) goto out_err; @@ -2878,6 +2897,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); + BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) goto out_err; @@ -3135,7 +3155,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ @@ -3442,6 +3462,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } @@ -3455,11 +3478,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, ext4_group_t grp; ext4_fsblk_t grp_blk; - if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) - return; - /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { + spin_unlock(&pa->pa_lock); + return; + } + if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; @@ -4001,8 +4026,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", - ac->ac_ex_scanned, ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { @@ -4121,7 +4145,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ - ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); + ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; @@ -4663,7 +4687,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. */ - overflow = block & (sbi->s_cluster_ratio - 1); + overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; @@ -4677,7 +4701,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, count += overflow; } } - overflow = count & (sbi->s_cluster_ratio - 1); + overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) @@ -4794,8 +4818,8 @@ do_more: " group:%d block:%d count:%lu failed" " with %d", block_group, bit, count, err); - } - + } else + EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); @@ -5002,6 +5026,8 @@ error_return: */ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) { struct ext4_free_extent ex; int ret = 0; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 08481ee84cd..d634e183b4d 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug; } \ } while (0) #else -#define mb_debug(n, fmt, a...) +#define mb_debug(n, fmt, a...) no_printk(fmt, ## a) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ @@ -175,8 +175,6 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; - /* number of iterations done. we have to track to limit searching */ - unsigned long ac_ex_scanned; __u16 ac_groups_scanned; __u16 ac_found; __u16 ac_tail; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 2ae73a80c19..ec092437d3e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -505,7 +505,7 @@ int ext4_ext_migrate(struct inode *inode) * with i_data_sem held to prevent racing with block * allocation. */ - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 214461e42a0..32bce844c2e 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -18,7 +18,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) return cpu_to_le32(csum); } -int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) +static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) { if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -27,7 +27,7 @@ int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); } -void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) +static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) { if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -259,7 +259,7 @@ static unsigned int mmp_new_seq(void) u32 new_seq; do { - get_random_bytes(&new_seq, sizeof(u32)); + new_seq = prandom_u32(); } while (new_seq > EXT4_MMP_SEQ_MAX); return new_seq; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 7fa4d855dbd..2484c7ec6a7 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -57,8 +57,8 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, static void copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) { - if (ext4_ext_is_uninitialized(src)) - ext4_ext_mark_uninitialized(dest); + if (ext4_ext_is_unwritten(src)) + ext4_ext_mark_unwritten(dest); else dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); } @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) * ext4_ext_path structure refers to the last extent, or a negative error * value on failure. */ -static int +int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { @@ -391,6 +391,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode, if (depth) { /* Register to journal */ + BUFFER_TRACE(orig_path->p_bh, "get_write_access"); ret = ext4_journal_get_write_access(handle, orig_path->p_bh); if (ret) return ret; @@ -593,14 +594,14 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * @inode: inode in question * @from: block offset of inode * @count: block count to be checked - * @uninit: extents expected to be uninitialized + * @unwritten: extents expected to be unwritten * @err: pointer to save error value * * Return 1 if all extents in range has expected type, and zero otherwise. */ static int mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, - int uninit, int *err) + int unwritten, int *err) { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; @@ -611,7 +612,7 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, if (*err) goto out; ext = path[ext_depth(inode)].p_ext; - if (uninit != ext4_ext_is_uninitialized(ext)) + if (unwritten != ext4_ext_is_unwritten(ext)) goto out; from += ext4_ext_get_actual_len(ext); ext4_ext_drop_refs(path); @@ -861,8 +862,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) } if (!buffer_mapped(bh)) { zero_user(page, block_start, blocksize); - if (!err) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); continue; } } @@ -895,7 +895,7 @@ out: * @orig_page_offset: page index on original file * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped - * @uninit: orig extent is uninitialized or not + * @unwritten: orig extent is unwritten or not * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents @@ -906,7 +906,7 @@ out: static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int uninit, int *err) + int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; @@ -963,27 +963,27 @@ again: if (unlikely(*err < 0)) goto stop_journal; /* - * If orig extent was uninitialized it can become initialized + * If orig extent was unwritten it can become initialized * at any time after i_data_sem was dropped, in order to * serialize with delalloc we have recheck extent while we * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ - if (uninit) { + if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to * fallback to data copying */ - uninit = mext_check_coverage(orig_inode, orig_blk_offset, - block_len_in_page, 1, err); + unwritten = mext_check_coverage(orig_inode, orig_blk_offset, + block_len_in_page, 1, err); if (*err) goto drop_data_sem; - uninit &= mext_check_coverage(donor_inode, orig_blk_offset, - block_len_in_page, 1, err); + unwritten &= mext_check_coverage(donor_inode, orig_blk_offset, + block_len_in_page, 1, err); if (*err) goto drop_data_sem; - if (!uninit) { + if (!unwritten) { ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } @@ -1203,42 +1203,6 @@ mext_check_arguments(struct inode *orig_inode, } /** - * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 - * - * @inode1: the inode structure - * @inode2: the inode structure - * - * Lock two inodes' i_mutex - */ -void -ext4_inode_double_lock(struct inode *inode1, struct inode *inode2) -{ - BUG_ON(inode1 == inode2); - if (inode1 < inode2) { - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); - } -} - -/** - * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 - * - * @inode1: the inode that is released first - * @inode2: the inode that is released second - * - */ - -void -ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2) -{ - mutex_unlock(&inode1->i_mutex); - mutex_unlock(&inode2->i_mutex); -} - -/** * ext4_move_extents - Exchange the specified range of a file * * @o_filp: file structure of the original file @@ -1296,7 +1260,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; int data_offset_in_page; int block_len_in_page; - int uninit; + int unwritten; if (orig_inode->i_sb != donor_inode->i_sb) { ext4_debug("ext4 move extent: The argument files " @@ -1327,7 +1291,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } /* Protect orig and donor inodes against a truncate */ - ext4_inode_double_lock(orig_inode, donor_inode); + lock_two_nondirectories(orig_inode, donor_inode); /* Wait for all existing dio workers */ ext4_inode_block_unlocked_dio(orig_inode); @@ -1428,8 +1392,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, !last_extent) continue; - /* Is original extent is uninitialized */ - uninit = ext4_ext_is_uninitialized(ext_prev); + /* Is original extent is unwritten */ + unwritten = ext4_ext_is_unwritten(ext_prev); data_offset_in_page = seq_start % blocks_per_page; @@ -1469,8 +1433,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, o_filp, donor_inode, orig_page_offset, data_offset_in_page, - block_len_in_page, uninit, - &ret); + block_len_in_page, + unwritten, &ret); /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; @@ -1535,7 +1499,7 @@ out: ext4_double_up_write_data_sem(orig_inode, donor_inode); ext4_inode_resume_unlocked_dio(orig_inode); ext4_inode_resume_unlocked_dio(donor_inode); - ext4_inode_double_unlock(orig_inode, donor_inode); + unlock_two_nondirectories(orig_inode, donor_inode); return ret; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1bec5a5c1e4..3520ab8a663 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -67,6 +67,7 @@ static struct buffer_head *ext4_append(handle_t *handle, return ERR_PTR(err); inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; + BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) { brelse(bh); @@ -1425,9 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi return ERR_PTR(-EIO); } if (unlikely(ino == dir->i_ino)) { - EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir", - dentry->d_name.len, - dentry->d_name.name); + EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", + dentry); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); @@ -1779,6 +1779,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + BUFFER_TRACE(bh, "get_write_access"); retval = ext4_journal_get_write_access(handle, bh); if (retval) { ext4_std_error(dir->i_sb, retval); @@ -2319,7 +2320,7 @@ retry: d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); if (err) - goto err_drop_inode; + goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } @@ -2328,10 +2329,9 @@ retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; -err_drop_inode: +err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); - iput(inode); return err; } @@ -2512,8 +2512,7 @@ static int empty_dir(struct inode *inode) ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); de = ext4_next_entry(de1, sb->s_blocksize); while (offset < inode->i_size) { - if (!bh || - (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { unsigned int lblock; err = 0; brelse(bh); @@ -2541,26 +2540,37 @@ static int empty_dir(struct inode *inode) return 1; } -/* ext4_orphan_add() links an unlinked or truncated inode into a list of +/* + * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the * file is closed/deleted, or in case the inode truncate spans multiple * transactions and the last transaction is not recovered after a crash. * * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). + * + * Orphan list manipulation functions must be called under i_mutex unless + * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_iloc iloc; int err = 0, rc; + bool dirty = false; - if (!EXT4_SB(sb)->s_journal) + if (!sbi->s_journal) return 0; - mutex_lock(&EXT4_SB(sb)->s_orphan_lock); + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !mutex_is_locked(&inode->i_mutex)); + /* + * Exit early if inode already is on orphan list. This is a big speedup + * since we don't have to contend on the global s_orphan_lock. + */ if (!list_empty(&EXT4_I(inode)->i_orphan)) - goto out_unlock; + return 0; /* * Orphan handling is only valid for files with data blocks @@ -2571,48 +2581,51 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) - goto out_unlock; + goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) - goto out_unlock; + goto out; + + mutex_lock(&sbi->s_orphan_lock); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ - if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= - (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) - goto mem_insert; - - /* Insert this inode at the head of the on-disk orphan list... */ - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); - EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_handle_dirty_super(handle, sb); - rc = ext4_mark_iloc_dirty(handle, inode, &iloc); - if (!err) - err = rc; - - /* Only add to the head of the in-memory list if all the - * previous operations succeeded. If the orphan_add is going to - * fail (possibly taking the journal offline), we can't risk - * leaving the inode on the orphan list: stray orphan-list - * entries can cause panics at unmount time. - * - * This is safe: on error we're going to ignore the orphan list - * anyway on the next recovery. */ -mem_insert: - if (!err) - list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - + if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > + (le32_to_cpu(sbi->s_es->s_inodes_count))) { + /* Insert this inode at the head of the on-disk orphan list */ + NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); + sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + dirty = true; + } + list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); + mutex_unlock(&sbi->s_orphan_lock); + + if (dirty) { + err = ext4_handle_dirty_super(handle, sb); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + if (err) { + /* + * We have to remove inode from in-memory list if + * addition to on disk orphan list failed. Stray orphan + * list entries can cause panics at unmount time. + */ + mutex_lock(&sbi->s_orphan_lock); + list_del(&EXT4_I(inode)->i_orphan); + mutex_unlock(&sbi->s_orphan_lock); + } + } jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); -out_unlock: - mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); - ext4_std_error(inode->i_sb, err); +out: + ext4_std_error(sb, err); return err; } @@ -2624,45 +2637,51 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 ino_next; struct ext4_iloc iloc; int err = 0; - if ((!EXT4_SB(inode->i_sb)->s_journal) && - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) + if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; - mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !mutex_is_locked(&inode->i_mutex)); + /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) - goto out; + return 0; - ino_next = NEXT_ORPHAN(inode); - prev = ei->i_orphan.prev; - sbi = EXT4_SB(inode->i_sb); + if (handle) { + /* Grab inode buffer early before taking global s_orphan_lock */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + } + mutex_lock(&sbi->s_orphan_lock); jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); /* If we're on an error path, we may not have a valid * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ - if (!handle) - goto out; - - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) + if (!handle || err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_err; + } + ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { jbd_debug(4, "superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) + if (err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; + } sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + mutex_unlock(&sbi->s_orphan_lock); err = ext4_handle_dirty_super(handle, inode->i_sb); } else { struct ext4_iloc iloc2; @@ -2672,20 +2691,20 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); - if (err) + if (err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; + } NEXT_ORPHAN(i_prev) = ino_next; err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + mutex_unlock(&sbi->s_orphan_lock); } if (err) goto out_brelse; NEXT_ORPHAN(inode) = 0; err = ext4_mark_iloc_dirty(handle, inode, &iloc); - out_err: ext4_std_error(inode->i_sb, err); -out: - mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: @@ -3002,6 +3021,154 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, return ext4_get_first_inline_block(inode, parent_de, retval); } +struct ext4_renament { + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool is_dir; + int dir_nlink_delta; + + /* entry for "dentry" */ + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + int inlined; + + /* entry for ".." in inode if it's a directory */ + struct buffer_head *dir_bh; + struct ext4_dir_entry_2 *parent_de; + int dir_inlined; +}; + +static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) +{ + int retval; + + ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, + &retval, &ent->parent_de, + &ent->dir_inlined); + if (!ent->dir_bh) + return retval; + if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) + return -EIO; + BUFFER_TRACE(ent->dir_bh, "get_write_access"); + return ext4_journal_get_write_access(handle, ent->dir_bh); +} + +static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, + unsigned dir_ino) +{ + int retval; + + ent->parent_de->inode = cpu_to_le32(dir_ino); + BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); + if (!ent->dir_inlined) { + if (is_dx(ent->inode)) { + retval = ext4_handle_dirty_dx_node(handle, + ent->inode, + ent->dir_bh); + } else { + retval = ext4_handle_dirty_dirent_node(handle, + ent->inode, + ent->dir_bh); + } + } else { + retval = ext4_mark_inode_dirty(handle, ent->inode); + } + if (retval) { + ext4_std_error(ent->dir->i_sb, retval); + return retval; + } + return 0; +} + +static int ext4_setent(handle_t *handle, struct ext4_renament *ent, + unsigned ino, unsigned file_type) +{ + int retval; + + BUFFER_TRACE(ent->bh, "get write access"); + retval = ext4_journal_get_write_access(handle, ent->bh); + if (retval) + return retval; + ent->de->inode = cpu_to_le32(ino); + if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb, + EXT4_FEATURE_INCOMPAT_FILETYPE)) + ent->de->file_type = file_type; + ent->dir->i_version++; + ent->dir->i_ctime = ent->dir->i_mtime = + ext4_current_time(ent->dir); + ext4_mark_inode_dirty(handle, ent->dir); + BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); + if (!ent->inlined) { + retval = ext4_handle_dirty_dirent_node(handle, + ent->dir, ent->bh); + if (unlikely(retval)) { + ext4_std_error(ent->dir->i_sb, retval); + return retval; + } + } + brelse(ent->bh); + ent->bh = NULL; + + return 0; +} + +static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, + const struct qstr *d_name) +{ + int retval = -ENOENT; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + + bh = ext4_find_entry(dir, d_name, &de, NULL); + if (bh) { + retval = ext4_delete_entry(handle, dir, de, bh); + brelse(bh); + } + return retval; +} + +static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) +{ + int retval; + /* + * ent->de could have moved from under us during htree split, so make + * sure that we are deleting the right entry. We might also be pointing + * to a stale entry in the unused part of ent->bh so just checking inum + * and the name isn't enough. + */ + if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || + ent->de->name_len != ent->dentry->d_name.len || + strncmp(ent->de->name, ent->dentry->d_name.name, + ent->de->name_len)) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } else { + retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh); + if (retval == -ENOENT) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } + } + + if (retval) { + ext4_warning(ent->dir->i_sb, + "Deleting old file (%lu), %d, error=%d", + ent->dir->i_ino, ent->dir->i_nlink, retval); + } +} + +static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) +{ + if (ent->dir_nlink_delta) { + if (ent->dir_nlink_delta == -1) + ext4_dec_count(handle, ent->dir); + else + ext4_inc_count(handle, ent->dir); + ext4_mark_inode_dirty(handle, ent->dir); + } +} + /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. @@ -3014,198 +3181,267 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { handle_t *handle = NULL; - struct inode *old_inode, *new_inode; - struct buffer_head *old_bh, *new_bh, *dir_bh; - struct ext4_dir_entry_2 *old_de, *new_de; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = old_dentry->d_inode, + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = new_dentry->d_inode, + }; int retval; - int inlined = 0, new_inlined = 0; - struct ext4_dir_entry_2 *parent_de; - dquot_initialize(old_dir); - dquot_initialize(new_dir); - - old_bh = new_bh = dir_bh = NULL; + dquot_initialize(old.dir); + dquot_initialize(new.dir); /* Initialize quotas before so that eventual writes go * in separate transaction */ - if (new_dentry->d_inode) - dquot_initialize(new_dentry->d_inode); + if (new.inode) + dquot_initialize(new.inode); - old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ - old_inode = old_dentry->d_inode; retval = -ENOENT; - if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; - new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, - &new_de, &new_inlined); - if (new_bh) { - if (!new_inode) { - brelse(new_bh); - new_bh = NULL; + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + if (new.bh) { + if (!new.inode) { + brelse(new.bh); + new.bh = NULL; } } - if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) - ext4_alloc_da_blocks(old_inode); + if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) + ext4_alloc_da_blocks(old.inode); - handle = ext4_journal_start(old_dir, EXT4_HT_DIR, - (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); if (IS_ERR(handle)) return PTR_ERR(handle); - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); - if (S_ISDIR(old_inode->i_mode)) { - if (new_inode) { + if (S_ISDIR(old.inode->i_mode)) { + if (new.inode) { retval = -ENOTEMPTY; - if (!empty_dir(new_inode)) + if (!empty_dir(new.inode)) + goto end_rename; + } else { + retval = -EMLINK; + if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } - retval = -EIO; - dir_bh = ext4_get_first_dir_block(handle, old_inode, - &retval, &parent_de, - &inlined); - if (!dir_bh) - goto end_rename; - if (le32_to_cpu(parent_de->inode) != old_dir->i_ino) - goto end_rename; - retval = -EMLINK; - if (!new_inode && new_dir != old_dir && - EXT4_DIR_LINK_MAX(new_dir)) - goto end_rename; - BUFFER_TRACE(dir_bh, "get_write_access"); - retval = ext4_journal_get_write_access(handle, dir_bh); + retval = ext4_rename_dir_prepare(handle, &old); if (retval) goto end_rename; } - if (!new_bh) { - retval = ext4_add_entry(handle, new_dentry, old_inode); + if (!new.bh) { + retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) goto end_rename; } else { - BUFFER_TRACE(new_bh, "get write access"); - retval = ext4_journal_get_write_access(handle, new_bh); + retval = ext4_setent(handle, &new, + old.inode->i_ino, old.de->file_type); if (retval) goto end_rename; - new_de->inode = cpu_to_le32(old_inode->i_ino); - if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, - EXT4_FEATURE_INCOMPAT_FILETYPE)) - new_de->file_type = old_de->file_type; - new_dir->i_version++; - new_dir->i_ctime = new_dir->i_mtime = - ext4_current_time(new_dir); - ext4_mark_inode_dirty(handle, new_dir); - BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - if (!new_inlined) { - retval = ext4_handle_dirty_dirent_node(handle, - new_dir, new_bh); - if (unlikely(retval)) { - ext4_std_error(new_dir->i_sb, retval); - goto end_rename; - } - } - brelse(new_bh); - new_bh = NULL; } /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = ext4_current_time(old_inode); - ext4_mark_inode_dirty(handle, old_inode); + old.inode->i_ctime = ext4_current_time(old.inode); + ext4_mark_inode_dirty(handle, old.inode); /* * ok, that's it */ - if (le32_to_cpu(old_de->inode) != old_inode->i_ino || - old_de->name_len != old_dentry->d_name.len || - strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || - (retval = ext4_delete_entry(handle, old_dir, - old_de, old_bh)) == -ENOENT) { - /* old_de could have moved from under us during htree split, so - * make sure that we are deleting the right entry. We might - * also be pointing to a stale entry in the unused part of - * old_bh so just checking inum and the name isn't enough. */ - struct buffer_head *old_bh2; - struct ext4_dir_entry_2 *old_de2; - - old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, - &old_de2, NULL); - if (old_bh2) { - retval = ext4_delete_entry(handle, old_dir, - old_de2, old_bh2); - brelse(old_bh2); - } + ext4_rename_delete(handle, &old); + + if (new.inode) { + ext4_dec_count(handle, new.inode); + new.inode->i_ctime = ext4_current_time(new.inode); } - if (retval) { - ext4_warning(old_dir->i_sb, - "Deleting old file (%lu), %d, error=%d", - old_dir->i_ino, old_dir->i_nlink, retval); - } - - if (new_inode) { - ext4_dec_count(handle, new_inode); - new_inode->i_ctime = ext4_current_time(new_inode); - } - old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); - ext4_update_dx_flag(old_dir); - if (dir_bh) { - parent_de->inode = cpu_to_le32(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - if (!inlined) { - if (is_dx(old_inode)) { - retval = ext4_handle_dirty_dx_node(handle, - old_inode, - dir_bh); - } else { - retval = ext4_handle_dirty_dirent_node(handle, - old_inode, dir_bh); - } - } else { - retval = ext4_mark_inode_dirty(handle, old_inode); - } - if (retval) { - ext4_std_error(old_dir->i_sb, retval); + old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); + ext4_update_dx_flag(old.dir); + if (old.dir_bh) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) goto end_rename; - } - ext4_dec_count(handle, old_dir); - if (new_inode) { + + ext4_dec_count(handle, old.dir); + if (new.inode) { /* checked empty_dir above, can't have another parent, * ext4_dec_count() won't work for many-linked dirs */ - clear_nlink(new_inode); + clear_nlink(new.inode); } else { - ext4_inc_count(handle, new_dir); - ext4_update_dx_flag(new_dir); - ext4_mark_inode_dirty(handle, new_dir); + ext4_inc_count(handle, new.dir); + ext4_update_dx_flag(new.dir); + ext4_mark_inode_dirty(handle, new.dir); } } - ext4_mark_inode_dirty(handle, old_dir); - if (new_inode) { - ext4_mark_inode_dirty(handle, new_inode); - if (!new_inode->i_nlink) - ext4_orphan_add(handle, new_inode); + ext4_mark_inode_dirty(handle, old.dir); + if (new.inode) { + ext4_mark_inode_dirty(handle, new.inode); + if (!new.inode->i_nlink) + ext4_orphan_add(handle, new.inode); + } + retval = 0; + +end_rename: + brelse(old.dir_bh); + brelse(old.bh); + brelse(new.bh); + if (handle) + ext4_journal_stop(handle); + return retval; +} + +static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + handle_t *handle = NULL; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = old_dentry->d_inode, + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = new_dentry->d_inode, + }; + u8 new_file_type; + int retval; + + dquot_initialize(old.dir); + dquot_initialize(new.dir); + + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, + &old.de, &old.inlined); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + retval = -ENOENT; + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) + goto end_rename; + + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + + /* RENAME_EXCHANGE case: old *and* new must both exist */ + if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) + goto end_rename; + + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) + ext4_handle_sync(handle); + + if (S_ISDIR(old.inode->i_mode)) { + old.is_dir = true; + retval = ext4_rename_dir_prepare(handle, &old); + if (retval) + goto end_rename; + } + if (S_ISDIR(new.inode->i_mode)) { + new.is_dir = true; + retval = ext4_rename_dir_prepare(handle, &new); + if (retval) + goto end_rename; } + + /* + * Other than the special case of overwriting a directory, parents' + * nlink only needs to be modified if this is a cross directory rename. + */ + if (old.dir != new.dir && old.is_dir != new.is_dir) { + old.dir_nlink_delta = old.is_dir ? -1 : 1; + new.dir_nlink_delta = -old.dir_nlink_delta; + retval = -EMLINK; + if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) || + (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir))) + goto end_rename; + } + + new_file_type = new.de->file_type; + retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type); + if (retval) + goto end_rename; + + retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type); + if (retval) + goto end_rename; + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old.inode->i_ctime = ext4_current_time(old.inode); + new.inode->i_ctime = ext4_current_time(new.inode); + ext4_mark_inode_dirty(handle, old.inode); + ext4_mark_inode_dirty(handle, new.inode); + + if (old.dir_bh) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) + goto end_rename; + } + if (new.dir_bh) { + retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino); + if (retval) + goto end_rename; + } + ext4_update_dir_count(handle, &old); + ext4_update_dir_count(handle, &new); retval = 0; end_rename: - brelse(dir_bh); - brelse(old_bh); - brelse(new_bh); + brelse(old.dir_bh); + brelse(new.dir_bh); + brelse(old.bh); + brelse(new.bh); if (handle) ext4_journal_stop(handle); return retval; } +static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return ext4_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + /* + * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE" + * is equivalent to regular rename. + */ + return ext4_rename(old_dir, old_dentry, new_dir, new_dentry); +} + /* * directories can handle most operations... */ @@ -3220,12 +3456,14 @@ const struct inode_operations ext4_dir_inode_operations = { .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, .rename = ext4_rename, + .rename2 = ext4_rename2, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, }; @@ -3236,4 +3474,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, }; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index d7d0c7b46ed..b24a2541a9b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -65,9 +65,9 @@ static void ext4_finish_bio(struct bio *bio) { int i; int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec; - for (i = 0; i < bio->bi_vcnt; i++) { - struct bio_vec *bvec = &bio->bi_io_vec[i]; + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct buffer_head *bh, *head; unsigned bio_start = bvec->bv_offset; @@ -197,14 +197,15 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) static void ext4_add_complete_io(ext4_io_end_t *io_end) { struct ext4_inode_info *ei = EXT4_I(io_end->inode); + struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb); struct workqueue_struct *wq; unsigned long flags; /* Only reserved conversions from writeback should enter here */ WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - WARN_ON(!io_end->handle); + WARN_ON(!io_end->handle && sbi->s_journal); spin_lock_irqsave(&ei->i_completed_io_lock, flags); - wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; + wq = sbi->rsv_conversion_wq; if (list_empty(&ei->i_rsv_conversion_list)) queue_work(wq, &ei->i_rsv_conversion_work); list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); @@ -297,7 +298,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; BUG_ON(!io_end); bio->bi_end_io = NULL; @@ -307,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) { struct inode *inode = io_end->inode; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - inode->i_ino, + error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, error); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { @@ -365,7 +367,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); if (!bio) return -ENOMEM; - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); @@ -399,7 +401,8 @@ submit_and_retry: int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, int len, - struct writeback_control *wbc) + struct writeback_control *wbc, + bool keep_towrite) { struct inode *inode = page->mapping->host; unsigned block_start, blocksize; @@ -412,10 +415,24 @@ int ext4_bio_write_page(struct ext4_io_submit *io, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - set_page_writeback(page); + if (keep_towrite) + set_page_writeback_keepwrite(page); + else + set_page_writeback(page); ClearPageError(page); /* + * Comments copied from block_write_full_page: + * + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + /* * In the first loop we prepare and mark buffers to submit. We have to * mark all buffers in the page before submitting so that * end_page_writeback() cannot be called from ext4_bio_end_io() when IO @@ -426,19 +443,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, do { block_start = bh_offset(bh); if (block_start >= len) { - /* - * Comments copied from block_write_full_page_endio: - * - * The page straddles i_size. It must be zeroed out on - * each and every writepage invocation because it may - * be mmapped. "A file is mapped in multiples of the - * page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when - * mapped, and writes to that region are not written - * out to the file." - */ - zero_user_segment(page, block_start, - block_start + blocksize); clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c5adbb318a9..bb0e80f03e2 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -42,7 +42,7 @@ int ext4_resize_begin(struct super_block *sb) void ext4_resize_end(struct super_block *sb) { clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, @@ -243,6 +243,7 @@ static int ext4_alloc_group_tables(struct super_block *sb, ext4_group_t group; ext4_group_t last_group; unsigned overhead; + __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0; BUG_ON(flex_gd->count == 0 || group_data == NULL); @@ -266,7 +267,7 @@ next_group: src_group++; for (; src_group <= last_group; src_group++) { overhead = ext4_group_overhead_blocks(sb, src_group); - if (overhead != 0) + if (overhead == 0) last_blk += group_data[src_group - group].blocks_count; else break; @@ -280,8 +281,7 @@ next_group: group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode bitmaps */ @@ -292,22 +292,30 @@ next_group: group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode tables */ for (; it_index < flex_gd->count; it_index++) { - if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) + unsigned int itb = EXT4_SB(sb)->s_itb_per_group; + ext4_fsblk_t next_group_start; + + if (start_blk + itb > last_blk) goto next_group; group_data[it_index].inode_table = start_blk; - group = ext4_get_group_number(sb, start_blk - 1); + group = ext4_get_group_number(sb, start_blk); + next_group_start = ext4_group_first_block_no(sb, group + 1); group -= group_data[0].group; - group_data[group].free_blocks_count -= - EXT4_SB(sb)->s_itb_per_group; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + if (start_blk + itb > next_group_start) { + flex_gd->bg_flags[group + 1] &= uninit_mask; + overhead = start_blk + itb - next_group_start; + group_data[group + 1].free_blocks_count -= overhead; + itb -= overhead; + } + + group_data[group].free_blocks_count -= itb; + flex_gd->bg_flags[group] &= uninit_mask; start_blk += EXT4_SB(sb)->s_itb_per_group; } @@ -340,6 +348,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, bh = sb_getblk(sb, blk); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); + BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, bh))) { brelse(bh); bh = ERR_PTR(err); @@ -401,7 +410,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, start = ext4_group_first_block_no(sb, group); group -= flex_gd->groups[0].group; - count2 = sb->s_blocksize * 8 - (block - start); + count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start); if (count2 > count) count2 = count; @@ -418,6 +427,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, if (unlikely(!bh)) return -ENOMEM; + BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) return err; @@ -510,6 +520,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, goto out; } + BUFFER_TRACE(gdb, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb); if (err) { brelse(gdb); @@ -620,7 +631,7 @@ handle_ib: if (err) goto out; count = group_table_count[j]; - start = group_data[i].block_bitmap; + start = (&group_data[i].block_bitmap)[j]; block = start; } @@ -782,14 +793,17 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, goto exit_dind; } + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (unlikely(err)) goto exit_dind; + BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) goto exit_dind; + BUFFER_TRACE(dind, "get_write_access"); err = ext4_journal_get_write_access(handle, dind); if (unlikely(err)) ext4_std_error(sb, err); @@ -894,6 +908,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb, EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; ext4_kvfree(o_group_desc); + BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) brelse(gdb_bh); @@ -969,6 +984,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, } for (i = 0; i < reserved_gdb; i++) { + BUFFER_TRACE(primary[i], "get_write_access"); if ((err = ext4_journal_get_write_access(handle, primary[i]))) goto exit_bh; } @@ -1076,6 +1092,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data, ext4_debug("update metadata backup %llu(+%llu)\n", backup_block, backup_block - ext4_group_first_block_no(sb, group)); + BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, bh))) break; lock_buffer(bh); @@ -1155,6 +1172,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, */ if (gdb_off) { gdb_bh = sbi->s_group_desc[gdb_num]; + BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) @@ -1425,6 +1443,7 @@ static int ext4_flex_group_add(struct super_block *sb, goto exit; } + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) goto exit_journal; @@ -1637,6 +1656,7 @@ static int ext4_group_extend_no_check(struct super_block *sb, return err; } + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) { ext4_warning(sb, "error %d on journal write access", err); @@ -1796,6 +1816,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) if (IS_ERR(handle)) return PTR_ERR(handle); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) goto errout; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2c2e6cbc6be..6df7bc611db 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -59,6 +59,7 @@ static struct kset *ext4_kset; static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; static struct ext4_features *ext4_feat; +static int ext4_mballoc_ready; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -137,8 +138,8 @@ static __le32 ext4_superblock_csum(struct super_block *sb, return cpu_to_le32(csum); } -int ext4_superblock_csum_verify(struct super_block *sb, - struct ext4_super_block *es) +static int ext4_superblock_csum_verify(struct super_block *sb, + struct ext4_super_block *es) { if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -411,20 +412,26 @@ static void ext4_handle_error(struct super_block *sb) sb->s_id); } +#define ext4_error_ratelimit(sb) \ + ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ + "EXT4-fs error") + void __ext4_error(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", - sb->s_id, function, line, current->comm, &vaf); - va_end(args); + if (ext4_error_ratelimit(sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); + va_end(args); + } save_error_info(sb, function, line); - ext4_handle_error(sb); } @@ -438,22 +445,23 @@ void __ext4_error_inode(struct inode *inode, const char *function, es->s_last_error_ino = cpu_to_le32(inode->i_ino); es->s_last_error_block = cpu_to_le64(block); + if (ext4_error_ratelimit(inode->i_sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: block %llu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, &vaf); + else + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, &vaf); + va_end(args); + } save_error_info(inode->i_sb, function, line); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - if (block) - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " - "inode #%lu: block %llu: comm %s: %pV\n", - inode->i_sb->s_id, function, line, inode->i_ino, - block, current->comm, &vaf); - else - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " - "inode #%lu: comm %s: %pV\n", - inode->i_sb->s_id, function, line, inode->i_ino, - current->comm, &vaf); - va_end(args); - ext4_handle_error(inode->i_sb); } @@ -469,27 +477,28 @@ void __ext4_error_file(struct file *file, const char *function, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); + if (ext4_error_ratelimit(inode->i_sb)) { + path = d_path(&(file->f_path), pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "block %llu: comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, path, &vaf); + else + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path, &vaf); + va_end(args); + } save_error_info(inode->i_sb, function, line); - path = d_path(&(file->f_path), pathname, sizeof(pathname)); - if (IS_ERR(path)) - path = "(unknown)"; - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - if (block) - printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: " - "block %llu: comm %s: path %s: %pV\n", - inode->i_sb->s_id, function, line, inode->i_ino, - block, current->comm, path, &vaf); - else - printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: " - "comm %s: path %s: %pV\n", - inode->i_sb->s_id, function, line, inode->i_ino, - current->comm, path, &vaf); - va_end(args); - ext4_handle_error(inode->i_sb); } @@ -543,11 +552,13 @@ void __ext4_std_error(struct super_block *sb, const char *function, (sb->s_flags & MS_RDONLY)) return; - errstr = ext4_decode_error(sb, errno, nbuf); - printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", - sb->s_id, function, line, errstr); - save_error_info(sb, function, line); + if (ext4_error_ratelimit(sb)) { + errstr = ext4_decode_error(sb, errno, nbuf); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + } + save_error_info(sb, function, line); ext4_handle_error(sb); } @@ -597,6 +608,9 @@ void __ext4_msg(struct super_block *sb, struct va_format vaf; va_list args; + if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) + return; + va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -610,6 +624,10 @@ void __ext4_warning(struct super_block *sb, const char *function, struct va_format vaf; va_list args; + if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), + "EXT4-fs warning")) + return; + va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -633,18 +651,20 @@ __acquires(bitlock) es->s_last_error_block = cpu_to_le64(block); __save_error_info(sb, function, line); - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", - sb->s_id, function, line, grp); - if (ino) - printk(KERN_CONT "inode %lu: ", ino); - if (block) - printk(KERN_CONT "block %llu:", (unsigned long long) block); - printk(KERN_CONT "%pV\n", &vaf); - va_end(args); + if (ext4_error_ratelimit(sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", + sb->s_id, function, line, grp); + if (ino) + printk(KERN_CONT "inode %lu: ", ino); + if (block) + printk(KERN_CONT "block %llu:", + (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); + va_end(args); + } if (test_opt(sb, ERRORS_CONT)) { ext4_commit_super(sb, 0); @@ -773,7 +793,7 @@ static void ext4_put_super(struct super_block *sb) } ext4_es_unregister_shrinker(sbi); - del_timer(&sbi->s_err_report); + del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); @@ -826,6 +846,10 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); sb->s_fs_info = NULL; @@ -855,6 +879,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; + spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); @@ -921,7 +946,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), @@ -1500,8 +1525,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, arg = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * arg; } else if (token == Opt_max_batch_time) { - if (arg == 0) - arg = EXT4_DEF_MAX_BATCH_TIME; sbi->s_max_batch_time = arg; } else if (token == Opt_min_batch_time) { sbi->s_min_batch_time = arg; @@ -1879,7 +1902,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); - else if ((sbi->s_mount_state & EXT4_ERROR_FS)) + else if (sbi->s_mount_state & EXT4_ERROR_FS) ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); @@ -2380,6 +2403,16 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, if (ext4_bg_has_super(sb, bg)) has_super = 1; + /* + * If we have a meta_bg fs with 1k blocks, group 0's GDT is at + * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled + * on modern mke2fs or blksize > 1k on older mke2fs) then we must + * compensate. + */ + if (sb->s_blocksize == 1024 && nr == 0 && + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0) + has_super++; + return (has_super + ext4_group_first_block_no(sb, bg)); } @@ -2606,6 +2639,12 @@ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), @@ -2623,6 +2662,12 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(max_writeback_mb_bump), ATTR_LIST(extent_max_zeroout_kb), ATTR_LIST(trigger_fs_error), + ATTR_LIST(err_ratelimit_interval_ms), + ATTR_LIST(err_ratelimit_burst), + ATTR_LIST(warning_ratelimit_interval_ms), + ATTR_LIST(warning_ratelimit_burst), + ATTR_LIST(msg_ratelimit_interval_ms), + ATTR_LIST(msg_ratelimit_burst), NULL, }; @@ -2762,10 +2807,11 @@ static void print_daily_error_info(unsigned long arg) es = sbi->s_es; if (es->s_error_count) - ext4_msg(sb, KERN_NOTICE, "error count: %u", + /* fsck newer than v1.41.13 is needed to clean this condition. */ + ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, @@ -2779,7 +2825,7 @@ static void print_daily_error_info(unsigned long arg) printk("\n"); } if (es->s_last_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, @@ -3037,7 +3083,6 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr; - unsigned long rnd; elr = kzalloc(sizeof(*elr), GFP_KERNEL); if (!elr) @@ -3052,10 +3097,8 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, * spread the inode table initialization requests * better. */ - get_random_bytes(&rnd, sizeof(rnd)); - elr->lr_next_sched = jiffies + (unsigned long)rnd % - (EXT4_DEF_LI_MAX_START_DELAY * HZ); - + elr->lr_next_sched = jiffies + (prandom_u32() % + (EXT4_DEF_LI_MAX_START_DELAY * HZ)); return elr; } @@ -3288,19 +3331,28 @@ int ext4_calculate_overhead(struct super_block *sb) } -static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) +static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; /* + * There's no need to reserve anything when we aren't using extents. + * The space estimates are exact, there are no unwritten extents, + * hole punching doesn't need new metadata... This is needed especially + * to keep ext2/3 backward compatibility. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return 0; + /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting - * uninitialized extents in delalloc path. In most cases such + * unwritten extents in delalloc path. In most cases such * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ - resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> + EXT4_SB(sb)->s_cluster_bits; do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); @@ -3538,6 +3590,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "feature flags set on rev 0 fs, " "running e2fsck is recommended"); + if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { + set_opt2(sb, HURD_COMPAT); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "The Hurd can't support 64-bit file systems"); + goto failed_mount; + } + } + if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " @@ -3658,16 +3720,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; - i = le32_to_cpu(es->s_flags); - if (i & EXT2_FLAGS_UNSIGNED_HASH) - sbi->s_hash_unsigned = 3; - else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ - es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); - sbi->s_hash_unsigned = 3; + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; #else - es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif + } } /* Handle clustersize */ @@ -3967,6 +4035,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); no_journal: + if (ext4_mballoc_ready) { + sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; + } + } + /* * Get the # of file system overhead blocks from the * superblock if present. @@ -4043,10 +4119,10 @@ no_journal: "available"); } - err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " - "reserved pool", ext4_calculate_resv_clusters(sbi)); + "reserved pool", ext4_calculate_resv_clusters(sb)); goto failed_mount4a; } @@ -4118,6 +4194,11 @@ no_journal: if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ + /* Enable message ratelimiting. Default is 10 messages per 5 secs. */ + ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); + ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); + ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); + kfree(orig_data); return 0; @@ -4151,7 +4232,7 @@ failed_mount_wq: } failed_mount3: ext4_es_unregister_shrinker(sbi); - del_timer(&sbi->s_err_report); + del_timer_sync(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); @@ -4787,6 +4868,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (*flags & MS_RDONLY) { + err = sync_filesystem(sb); + if (err < 0) + goto restore_opts; err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; @@ -5293,6 +5377,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, bh = ext4_bread(handle, inode, blk, 1, &err); if (!bh) goto out; + BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, bh); if (err) { brelse(bh); @@ -5468,11 +5553,9 @@ static int __init ext4_init_fs(void) err = ext4_init_mballoc(); if (err) - goto out3; - - err = ext4_init_xattr(); - if (err) goto out2; + else + ext4_mballoc_ready = 1; err = init_inodecache(); if (err) goto out1; @@ -5488,10 +5571,9 @@ out: unregister_as_ext3(); destroy_inodecache(); out1: - ext4_exit_xattr(); -out2: + ext4_mballoc_ready = 0; ext4_exit_mballoc(); -out3: +out2: ext4_exit_feat_adverts(); out4: if (ext4_proc_root) @@ -5514,7 +5596,6 @@ static void __exit ext4_exit_fs(void) unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); destroy_inodecache(); - ext4_exit_xattr(); ext4_exit_mballoc(); ext4_exit_feat_adverts(); remove_proc_entry("fs/ext4", NULL); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c081e34f717..e7387337060 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -81,7 +81,7 @@ # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); @@ -90,13 +90,11 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *, static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size); -static struct mb_cache *ext4_xattr_cache; - static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, - [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY @@ -108,8 +106,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - &ext4_xattr_acl_access_handler, - &ext4_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, @@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = { NULL }; +#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_mb_cache) + static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) @@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -286,7 +288,7 @@ bad_block: error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); + ext4_xattr_cache_insert(ext4_mb_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); if (error == -EIO) @@ -367,6 +369,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, { int error; + if (strlen(name) > 255) + return -ERANGE; + down_read(&EXT4_I(inode)->xattr_sem); error = ext4_xattr_ibody_get(inode, name_index, name, buffer, buffer_size); @@ -409,6 +414,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -430,7 +436,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); + ext4_xattr_cache_insert(ext4_mb_cache, bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: @@ -510,6 +516,7 @@ static void ext4_xattr_update_super_block(handle_t *handle, if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) return; + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); ext4_handle_dirty_super(handle, sb); @@ -517,8 +524,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, } /* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, @@ -526,8 +533,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, { struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); - ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); + ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); + BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bh); if (error) goto out; @@ -538,16 +547,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ce) mb_cache_entry_free(ce); get_bh(bh); + unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_xattr_block() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. + */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); unlock_buffer(bh); - error = ext4_handle_dirty_xattr_block(handle, inode, bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); @@ -567,12 +591,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - *total += EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_block && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } + if (total) + *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } @@ -745,14 +770,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_search *s = &bs->s; struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) if (i->value && i->value_len > sb->s_blocksize) return -ENOSPC; if (s->base) { - ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, bs->bh->b_blocknr); + BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); if (error) goto cleanup; @@ -769,7 +796,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (!IS_LAST_ENTRY(s->first)) ext4_xattr_rehash(header(s->base), s->here); - ext4_xattr_cache_insert(bs->bh); + ext4_xattr_cache_insert(ext4_mb_cache, + bs->bh); } unlock_buffer(bs->bh); if (error == -EIO) @@ -837,6 +865,7 @@ inserted: EXT4_C2B(EXT4_SB(sb), 1)); if (error) goto cleanup; + BUFFER_TRACE(new_bh, "get_write_access"); error = ext4_journal_get_write_access(handle, new_bh); if (error) @@ -874,7 +903,7 @@ inserted: * take i_data_sem because we will test * i_delalloc_reserved_flag in ext4_mb_new_blocks */ - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); up_read((&EXT4_I(inode)->i_data_sem)); @@ -905,7 +934,7 @@ getblk_failed: memcpy(new_bh->b_data, s->base, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext4_xattr_cache_insert(new_bh); + ext4_xattr_cache_insert(ext4_mb_cache, new_bh); error = ext4_handle_dirty_xattr_block(handle, inode, new_bh); if (error) @@ -1228,7 +1257,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t min_offs, free; - int total_ino, total_blk; + int total_ino; void *base, *start, *end; int extra_isize = 0, error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); @@ -1286,8 +1315,7 @@ retry: first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, - &total_blk); + free = ext4_xattr_free_space(first, &min_offs, base, NULL); if (free < new_extra_isize) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; @@ -1350,6 +1378,9 @@ retry: s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; + kfree(is); is = NULL; + kfree(bs); bs = NULL; + brelse(bh); goto retry; } error = -1; @@ -1492,13 +1523,13 @@ ext4_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. */ static void -ext4_xattr_cache_insert(struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); + ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); if (!ce) { ea_bdebug(bh, "out of memory"); return; @@ -1570,12 +1601,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, + ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, hash); while (ce) { struct buffer_head *bh; @@ -1673,19 +1705,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #undef BLOCK_HASH_SHIFT -int __init -ext4_init_xattr(void) +#define HASH_BUCKET_BITS 10 + +struct mb_cache * +ext4_xattr_create_cache(char *name) { - ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); - if (!ext4_xattr_cache) - return -ENOMEM; - return 0; + return mb_cache_create(name, HASH_BUCKET_BITS); } -void -ext4_exit_xattr(void) +void ext4_xattr_destroy_cache(struct mb_cache *cache) { - if (ext4_xattr_cache) - mb_cache_destroy(ext4_xattr_cache); - ext4_xattr_cache = NULL; + if (cache) + mb_cache_destroy(cache); } + diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index c767dbdd7fc..29bedf5589f 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -96,8 +96,6 @@ struct ext4_xattr_ibody_find { extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; -extern const struct xattr_handler ext4_xattr_acl_access_handler; -extern const struct xattr_handler ext4_xattr_acl_default_handler; extern const struct xattr_handler ext4_xattr_security_handler; extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); @@ -112,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); -extern int __init ext4_init_xattr(void); -extern void ext4_exit_xattr(void); - extern const struct xattr_handler *ext4_xattr_handlers[]; extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, @@ -126,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); +extern struct mb_cache *ext4_xattr_create_cache(char *name); +extern void ext4_xattr_destroy_cache(struct mb_cache *); + #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index e06e0995e00..214fe1054fc 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -63,3 +63,11 @@ config F2FS_FS_SECURITY the extended attribute support in advance. If you are not using a security module, say N. + +config F2FS_CHECK_FS + bool "F2FS consistency checking feature" + depends on F2FS_FS + help + Enables BUG_ONs which check the file system consistency in runtime. + + If you want to improve the performance, say N. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 27a0820340b..2e35da12d29 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o -f2fs-y := dir.o file.o inode.o namei.o hash.o super.o +f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index b7826ec1b47..dbe2141d10a 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -17,9 +17,6 @@ #include "xattr.h" #include "acl.h" -#define get_inode_mode(i) ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ - (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) - static inline size_t f2fs_acl_size(int count) { if (count <= 4) { @@ -167,25 +164,17 @@ fail: struct posix_acl *f2fs_get_acl(struct inode *inode, int type) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; void *value = NULL; struct posix_acl *acl; int retval; - if (!test_opt(sbi, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - if (type == ACL_TYPE_ACCESS) name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; retval = f2fs_getxattr(inode, name_index, "", NULL, 0); if (retval > 0) { - value = kmalloc(retval, GFP_KERNEL); + value = kmalloc(retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, retval); @@ -205,19 +194,20 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return acl; } -static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +static int __f2fs_set_acl(struct inode *inode, int type, + struct posix_acl *acl, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; int error; - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; + if (acl) { + error = posix_acl_valid(acl); + if (error < 0) + return error; + } switch (type) { case ACL_TYPE_ACCESS: @@ -250,7 +240,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } } - error = f2fs_setxattr(inode, name_index, "", value, size, NULL); + error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); kfree(value); if (!error) @@ -260,153 +250,31 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -int f2fs_init_acl(struct inode *inode, struct inode *dir) +int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(sbi, POSIX_ACL)) { - acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - - if (test_opt(sbi, POSIX_ACL) && acl) { - - if (S_ISDIR(inode->i_mode)) { - error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); - } -cleanup: - posix_acl_release(acl); - return error; + return __f2fs_set_acl(inode, type, acl, NULL); } -int f2fs_acl_chmod(struct inode *inode) +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct posix_acl *acl; - int error; - umode_t mode = get_inode_mode(inode); - - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(mode)) - return -EOPNOTSUPP; - - acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); + struct posix_acl *default_acl, *acl; + int error = 0; - error = posix_acl_chmod(&acl, GFP_KERNEL, mode); + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; -} - -static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - const char *xname = POSIX_ACL_XATTR_DEFAULT; - size_t size; - if (!test_opt(sbi, POSIX_ACL)) - return 0; - - if (type == ACL_TYPE_ACCESS) - xname = POSIX_ACL_XATTR_ACCESS; - - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} - -static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = f2fs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else { - acl = NULL; + if (default_acl) { + error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, + ipage); + posix_acl_release(default_acl); + } + if (acl) { + if (error) + error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, + ipage); + posix_acl_release(acl); } - error = f2fs_set_acl(inode, type, acl); - -release_and_out: - posix_acl_release(acl); return error; } - -const struct xattr_handler f2fs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; - -const struct xattr_handler f2fs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 80f43067441..e0864651cdc 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -36,20 +36,16 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL -extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type); -extern int f2fs_acl_chmod(struct inode *inode); -extern int f2fs_init_acl(struct inode *inode, struct inode *dir); +extern struct posix_acl *f2fs_get_acl(struct inode *, int); +extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int f2fs_init_acl(struct inode *, struct inode *, struct page *); #else #define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL -static inline int f2fs_acl_chmod(struct inode *inode) -{ - return 0; -} - -static inline int f2fs_init_acl(struct inode *inode, struct inode *dir) +static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, + struct page *page) { return 0; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index bb312201ca9..0b4710c1d37 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -30,7 +30,7 @@ static struct kmem_cache *inode_entry_slab; */ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: page = grab_cache_page(mapping, index); @@ -38,9 +38,7 @@ repeat: cond_resched(); goto repeat; } - - /* We wait writeback only inside grab_meta_page() */ - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, META); SetPageUptodate(page); return page; } @@ -50,7 +48,7 @@ repeat: */ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct page *page; repeat: page = grab_cache_page(mapping, index); @@ -61,67 +59,154 @@ repeat: if (PageUptodate(page)) goto out; - if (f2fs_readpage(sbi, page, index, READ_SYNC)) + if (f2fs_submit_page_bio(sbi, page, index, + READ_SYNC | REQ_META | REQ_PRIO)) goto repeat; lock_page(page); - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } out: - mark_page_accessed(page); return page; } +static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) +{ + switch (type) { + case META_NAT: + return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK; + case META_SIT: + return SIT_BLK_CNT(sbi); + case META_SSA: + case META_CP: + return 0; + default: + BUG(); + } +} + +/* + * Readahead CP/NAT/SIT/SSA pages + */ +int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) +{ + block_t prev_blk_addr = 0; + struct page *page; + int blkno = start; + int max_blks = get_max_meta_blks(sbi, type); + + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (; nrpages-- > 0; blkno++) { + block_t blk_addr; + + switch (type) { + case META_NAT: + /* get nat block addr */ + if (unlikely(blkno >= max_blks)) + blkno = 0; + blk_addr = current_nat_addr(sbi, + blkno * NAT_ENTRY_PER_BLOCK); + break; + case META_SIT: + /* get sit block addr */ + if (unlikely(blkno >= max_blks)) + goto out; + blk_addr = current_sit_addr(sbi, + blkno * SIT_ENTRY_PER_BLOCK); + if (blkno != start && prev_blk_addr + 1 != blk_addr) + goto out; + prev_blk_addr = blk_addr; + break; + case META_SSA: + case META_CP: + /* get ssa/cp block addr */ + blk_addr = blkno; + break; + default: + BUG(); + } + + page = grab_cache_page(META_MAPPING(sbi), blk_addr); + if (!page) + continue; + if (PageUptodate(page)) { + f2fs_put_page(page, 1); + continue; + } + + f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); + f2fs_put_page(page, 0); + } +out: + f2fs_submit_merged_bio(sbi, META, READ); + return blkno - start; +} + static int f2fs_write_meta_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - /* Should not write any meta pages, if any IO error was occurred */ - if (wbc->for_reclaim || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { - dec_page_count(sbi, F2FS_DIRTY_META); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; - } + trace_f2fs_writepage(page, META); - wait_on_page_writeback(page); + if (unlikely(sbi->por_doing)) + goto redirty_out; + if (wbc->for_reclaim) + goto redirty_out; + /* Should not write any meta pages, if any IO error was occurred */ + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto no_write; + + f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); +no_write: dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; } static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - struct block_device *bdev = sbi->sb->s_bdev; - long written; + long diff, written; - if (wbc->for_kupdate) - return 0; + trace_f2fs_writepages(mapping->host, wbc, META); - if (get_pages(sbi, F2FS_DIRTY_META) == 0) - return 0; + /* collect a number of dirty meta pages and write together */ + if (wbc->for_kupdate || + get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + goto skip_write; /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); - written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev)); + diff = nr_pages_to_write(sbi, META, wbc); + written = sync_meta_pages(sbi, META, wbc->nr_to_write); mutex_unlock(&sbi->cp_mutex); - wbc->nr_to_write -= written; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); return 0; } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = LONG_MAX; struct pagevec pvec; long nwritten = 0; @@ -136,20 +221,33 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) + if (unlikely(nr_pages == 0)) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + lock_page(page); - BUG_ON(page->mapping != mapping); - BUG_ON(!PageDirty(page)); - clear_page_dirty_for_io(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + if (f2fs_write_meta_page(page, &wbc)) { unlock_page(page); break; } - if (nwritten++ >= nr_to_write) + nwritten++; + if (unlikely(nwritten >= nr_to_write)) break; } pagevec_release(&pvec); @@ -157,7 +255,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, } if (nwritten) - f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX); + f2fs_submit_merged_bio(sbi, type, WRITE); return nwritten; } @@ -167,6 +265,8 @@ static int f2fs_set_meta_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + trace_f2fs_set_page_dirty(page, META); + SetPageUptodate(page); if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); @@ -184,62 +284,50 @@ const struct address_space_operations f2fs_meta_aops = { int acquire_orphan_inode(struct f2fs_sb_info *sbi) { - unsigned int max_orphans; int err = 0; - /* - * considering 512 blocks in a segment 5 blocks are needed for cp - * and log segment summaries. Remaining blocks are used to keep - * orphan entries with the limitation one reserved segment - * for cp pack we can have max 1020*507 orphan entries - */ - max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK; - mutex_lock(&sbi->orphan_inode_mutex); - if (sbi->n_orphans >= max_orphans) + spin_lock(&sbi->orphan_inode_lock); + if (unlikely(sbi->n_orphans >= sbi->max_orphans)) err = -ENOSPC; else sbi->n_orphans++; - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); + return err; } void release_orphan_inode(struct f2fs_sb_info *sbi) { - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); + f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head, *this; - struct orphan_inode_entry *new = NULL, *orphan = NULL; + struct list_head *head; + struct orphan_inode_entry *new, *orphan; - mutex_lock(&sbi->orphan_inode_mutex); + new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); + new->ino = ino; + + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; - list_for_each(this, head) { - orphan = list_entry(this, struct orphan_inode_entry, list); - if (orphan->ino == ino) - goto out; + list_for_each_entry(orphan, head, list) { + if (orphan->ino == ino) { + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, new); + return; + } + if (orphan->ino > ino) break; - orphan = NULL; - } -retry: - new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); - if (!new) { - cond_resched(); - goto retry; } - new->ino = ino; - /* add new_oentry into list which is sorted by inode number */ - if (orphan) - list_add(&new->list, this->prev); - else - list_add_tail(&new->list, head); -out: - mutex_unlock(&sbi->orphan_inode_mutex); + /* add new orphan entry into list which is sorted by inode number */ + list_add_tail(&new->list, &orphan->list); + spin_unlock(&sbi->orphan_inode_lock); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -247,40 +335,46 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) struct list_head *head; struct orphan_inode_entry *orphan; - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { list_del(&orphan->list); - kmem_cache_free(orphan_entry_slab, orphan); + f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - break; + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, orphan); + return; } } - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode = f2fs_iget(sbi->sb, ino); - BUG_ON(IS_ERR(inode)); + f2fs_bug_on(IS_ERR(inode)); clear_nlink(inode); /* truncate all the data during iput */ iput(inode); } -int recover_orphan_inodes(struct f2fs_sb_info *sbi) +void recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blkaddr, i, j; if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) - return 0; + return; - sbi->por_doing = 1; - start_blk = __start_cp_addr(sbi) + 1; + sbi->por_doing = true; + + start_blk = __start_cp_addr(sbi) + 1 + + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); orphan_blkaddr = __start_sum_addr(sbi) - 1; + ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP); + for (i = 0; i < orphan_blkaddr; i++) { struct page *page = get_meta_page(sbi, start_blk + i); struct f2fs_orphan_block *orphan_blk; @@ -294,30 +388,40 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) } /* clear Orphan Flag */ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); - sbi->por_doing = 0; - return 0; + sbi->por_doing = false; + return; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) { - struct list_head *head, *this, *next; + struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; - struct page *page = NULL; unsigned int nentries = 0; - unsigned short index = 1; - unsigned short orphan_blocks; - - orphan_blocks = (unsigned short)((sbi->n_orphans + + unsigned short index; + unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); + struct page *page = NULL; + struct orphan_inode_entry *orphan = NULL; - mutex_lock(&sbi->orphan_inode_mutex); + for (index = 0; index < orphan_blocks; index++) + grab_meta_page(sbi, start_blk + index); + + index = 1; + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; /* loop for each orphan inode entry and write them in Jornal block */ - list_for_each_safe(this, next, head) { - struct orphan_inode_entry *orphan; + list_for_each_entry(orphan, head, list) { + if (!page) { + page = find_get_page(META_MAPPING(sbi), start_blk++); + f2fs_bug_on(!page); + orphan_blk = + (struct f2fs_orphan_block *)page_address(page); + memset(orphan_blk, 0, sizeof(*orphan_blk)); + f2fs_put_page(page, 0); + } - orphan = list_entry(this, struct orphan_inode_entry, list); + orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); if (nentries == F2FS_ORPHANS_PER_BLOCK) { /* @@ -331,29 +435,20 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) set_page_dirty(page); f2fs_put_page(page, 1); index++; - start_blk++; nentries = 0; page = NULL; } - if (page) - goto page_exist; + } - page = grab_meta_page(sbi, start_blk); - orphan_blk = (struct f2fs_orphan_block *)page_address(page); - memset(orphan_blk, 0, sizeof(*orphan_blk)); -page_exist: - orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); + if (page) { + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + set_page_dirty(page); + f2fs_put_page(page, 1); } - if (!page) - goto end; - - orphan_blk->blk_addr = cpu_to_le16(index); - orphan_blk->blk_count = cpu_to_le16(orphan_blocks); - orphan_blk->entry_count = cpu_to_le32(nentries); - set_page_dirty(page); - f2fs_put_page(page, 1); -end: - mutex_unlock(&sbi->orphan_inode_mutex); + + spin_unlock(&sbi->orphan_inode_lock); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -416,8 +511,11 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) unsigned long blk_size = sbi->blocksize; unsigned long long cp1_version = 0, cp2_version = 0; unsigned long long cp_start_blk_no; + unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); + block_t cp_blk_no; + int i; - sbi->ckpt = kzalloc(blk_size, GFP_KERNEL); + sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* @@ -428,7 +526,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); /* The second checkpoint pack should start at the next segment */ - cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + cp_start_blk_no += ((unsigned long long)1) << + le32_to_cpu(fsb->log_blocks_per_seg); cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); if (cp1 && cp2) { @@ -447,6 +546,23 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint *)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); + if (cp_blks <= 1) + goto done; + + cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); + if (cur_page == cp2) + cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + + for (i = 1; i < cp_blks; i++) { + void *sit_bitmap_ptr; + unsigned char *ckpt = (unsigned char *)sbi->ckpt; + + cur_page = get_meta_page(sbi, cp_blk_no + i); + sit_bitmap_ptr = page_address(cur_page); + memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); + f2fs_put_page(cur_page, 1); + } +done: f2fs_put_page(cp1, 1); f2fs_put_page(cp2, 1); return 0; @@ -459,19 +575,14 @@ fail_no_cp: static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; - - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode == inode) - return -EEXIST; - } - list_add_tail(&new->list, head); -#ifdef CONFIG_F2FS_STAT_FS - sbi->n_dirty_dirs++; -#endif + + if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) + return -EEXIST; + + set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); + F2FS_I(inode)->dirty_dir = new; + list_add_tail(&new->list, &sbi->dir_inode_list); + stat_inc_dirty_dir(sbi); return 0; } @@ -479,75 +590,65 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct dir_inode_entry *new; + int ret = 0; if (!S_ISDIR(inode->i_mode)) return; -retry: - new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - if (!new) { - cond_resched(); - goto retry; - } + + new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); new->inode = inode; INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); - - inc_page_count(sbi, F2FS_DIRTY_DENTS); + ret = __add_dirty_inode(inode, new); inode_inc_dirty_dents(inode); SetPagePrivate(page); spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); } void add_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct dir_inode_entry *new; -retry: - new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - if (!new) { - cond_resched(); - goto retry; - } + struct dir_inode_entry *new = + f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + int ret = 0; + new->inode = inode; INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); + ret = __add_dirty_inode(inode, new); spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); } void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + struct dir_inode_entry *entry; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&sbi->dir_inode_lock); - if (atomic_read(&F2FS_I(inode)->dirty_dents)) { + if (get_dirty_dents(inode) || + !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { spin_unlock(&sbi->dir_inode_lock); return; } - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode == inode) { - list_del(&entry->list); - kmem_cache_free(inode_entry_slab, entry); -#ifdef CONFIG_F2FS_STAT_FS - sbi->n_dirty_dirs--; -#endif - break; - } - } + entry = F2FS_I(inode)->dirty_dir; + list_del(&entry->list); + F2FS_I(inode)->dirty_dir = NULL; + clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); + stat_dec_dirty_dir(sbi); spin_unlock(&sbi->dir_inode_lock); + kmem_cache_free(inode_entry_slab, entry); /* Only from the recovery routine */ if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { @@ -556,32 +657,15 @@ void remove_dirty_dir_inode(struct inode *inode) } } -struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) -{ - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; - struct inode *inode = NULL; - - spin_lock(&sbi->dir_inode_lock); - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode->i_ino == ino) { - inode = entry->inode; - break; - } - } - spin_unlock(&sbi->dir_inode_lock); - return inode; -} - void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) { - struct list_head *head = &sbi->dir_inode_list; + struct list_head *head; struct dir_inode_entry *entry; struct inode *inode; retry: spin_lock(&sbi->dir_inode_lock); + + head = &sbi->dir_inode_list; if (list_empty(head)) { spin_unlock(&sbi->dir_inode_lock); return; @@ -590,14 +674,14 @@ retry: inode = igrab(entry->inode); spin_unlock(&sbi->dir_inode_lock); if (inode) { - filemap_flush(inode->i_mapping); + filemap_fdatawrite(inode->i_mapping); iput(inode); } else { /* * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. */ - f2fs_submit_bio(sbi, DATA, true); + f2fs_submit_merged_bio(sbi, DATA, WRITE); } goto retry; } @@ -617,11 +701,10 @@ static void block_operations(struct f2fs_sb_info *sbi) blk_start_plug(&plug); retry_flush_dents: - mutex_lock_all(sbi); - + f2fs_lock_all(sbi); /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { - mutex_unlock_all(sbi); + f2fs_unlock_all(sbi); sync_dirty_dir_inodes(sbi); goto retry_flush_dents; } @@ -644,7 +727,22 @@ retry_flush_nodes: static void unblock_operations(struct f2fs_sb_info *sbi) { mutex_unlock(&sbi->node_write); - mutex_unlock_all(sbi); + f2fs_unlock_all(sbi); +} + +static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); + + if (!get_pages(sbi, F2FS_WRITEBACK)) + break; + + io_schedule(); + } + finish_wait(&sbi->cp_wait, &wait); } static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) @@ -657,6 +755,13 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) __u32 crc32 = 0; void *kaddr; int i; + int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); + + /* + * This avoids to conduct wrong roll-forward operations and uses + * metapages, so should be called prior to sync_meta_pages below. + */ + discard_next_dnode(sbi); /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) @@ -701,16 +806,19 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) / F2FS_ORPHANS_PER_BLOCK; - ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks); + ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + + orphan_blocks); if (is_umount) { set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); ckpt->cp_pack_total_block_count = cpu_to_le32(2 + - data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); + cp_payload_blks + data_sum_blocks + + orphan_blocks + NR_CURSEG_NODE_TYPE); } else { clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); ckpt->cp_pack_total_block_count = cpu_to_le32(2 + - data_sum_blocks + orphan_blocks); + cp_payload_blks + data_sum_blocks + + orphan_blocks); } if (sbi->n_orphans) @@ -736,6 +844,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) set_page_dirty(cp_page); f2fs_put_page(cp_page, 1); + for (i = 1; i < 1 + cp_payload_blks; i++) { + cp_page = grab_meta_page(sbi, start_blk++); + kaddr = page_address(cp_page); + memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, + (1 << sbi->log_blocksize)); + set_page_dirty(cp_page); + f2fs_put_page(cp_page, 1); + } + if (sbi->n_orphans) { write_orphan_inodes(sbi, start_blk); start_blk += orphan_blocks; @@ -756,11 +873,10 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) f2fs_put_page(cp_page, 1); /* wait for previous submitted node/meta pages writeback */ - while (get_pages(sbi, F2FS_WRITEBACK)) - congestion_wait(BLK_RW_ASYNC, HZ / 50); + wait_on_all_pages_writeback(sbi); - filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); - filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX); + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); + filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; @@ -769,7 +885,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { clear_prefree_segments(sbi); F2FS_RESET_SB_DIRT(sbi); } @@ -790,9 +906,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); - f2fs_submit_bio(sbi, DATA, true); - f2fs_submit_bio(sbi, NODE, true); - f2fs_submit_bio(sbi, META, true); + f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_bio(sbi, META, WRITE); /* * update checkpoint pack index @@ -812,25 +928,34 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) unblock_operations(sbi); mutex_unlock(&sbi->cp_mutex); + stat_inc_cp_count(sbi->stat_info); trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } void init_orphan_info(struct f2fs_sb_info *sbi) { - mutex_init(&sbi->orphan_inode_mutex); + spin_lock_init(&sbi->orphan_inode_lock); INIT_LIST_HEAD(&sbi->orphan_inode_list); sbi->n_orphans = 0; + /* + * considering 512 blocks in a segment 8 blocks are needed for cp + * and log segment summaries. Remaining blocks are used to keep + * orphan entries with the limitation one reserved segment + * for cp pack we can have max 1020*504 orphan entries + */ + sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) + * F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", - sizeof(struct orphan_inode_entry), NULL); - if (unlikely(!orphan_entry_slab)) + sizeof(struct orphan_inode_entry)); + if (!orphan_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", - sizeof(struct dir_inode_entry), NULL); - if (unlikely(!inode_entry_slab)) { + sizeof(struct dir_inode_entry)); + if (!inode_entry_slab) { kmem_cache_destroy(orphan_entry_slab); return -ENOMEM; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 941f9b9ca3a..f8cf619edb5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -24,6 +24,190 @@ #include "segment.h" #include <trace/events/f2fs.h> +static void f2fs_read_end_io(struct bio *bio, int err) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + bio_put(bio); +} + +static void f2fs_write_end_io(struct bio *bio, int err) +{ + struct f2fs_sb_info *sbi = bio->bi_private; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (unlikely(err)) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + f2fs_stop_checkpoint(sbi); + } + end_page_writeback(page); + dec_page_count(sbi, F2FS_WRITEBACK); + } + + if (sbi->wait_io) { + complete(sbi->wait_io); + sbi->wait_io = NULL; + } + + if (!get_pages(sbi, F2FS_WRITEBACK) && + !list_empty(&sbi->cp_wait.task_list)) + wake_up(&sbi->cp_wait); + + bio_put(bio); +} + +/* + * Low-level block read/write IO operations. + */ +static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, + int npages, bool is_read) +{ + struct bio *bio; + + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + + bio->bi_bdev = sbi->sb->s_bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; + bio->bi_private = sbi; + + return bio; +} + +static void __submit_merged_bio(struct f2fs_bio_info *io) +{ + struct f2fs_io_info *fio = &io->fio; + int rw; + + if (!io->bio) + return; + + rw = fio->rw; + + if (is_read_io(rw)) { + trace_f2fs_submit_read_bio(io->sbi->sb, rw, + fio->type, io->bio); + submit_bio(rw, io->bio); + } else { + trace_f2fs_submit_write_bio(io->sbi->sb, rw, + fio->type, io->bio); + /* + * META_FLUSH is only from the checkpoint procedure, and we + * should wait this metadata bio for FS consistency. + */ + if (fio->type == META_FLUSH) { + DECLARE_COMPLETION_ONSTACK(wait); + io->sbi->wait_io = &wait; + submit_bio(rw, io->bio); + wait_for_completion(&wait); + } else { + submit_bio(rw, io->bio); + } + } + + io->bio = NULL; +} + +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + enum page_type type, int rw) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io; + + io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + + down_write(&io->io_rwsem); + + /* change META to META_FLUSH in the checkpoint procedure */ + if (type >= META_FLUSH) { + io->fio.type = META_FLUSH; + io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + } + __submit_merged_bio(io); + up_write(&io->io_rwsem); +} + +/* + * Fill the locked page with data located in the block address. + * Return unlocked page. + */ +int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, int rw) +{ + struct bio *bio; + + trace_f2fs_submit_page_bio(page, blk_addr, rw); + + /* Allocate a new bio */ + bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw)); + + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + bio_put(bio); + f2fs_put_page(page, 1); + return -EFAULT; + } + + submit_bio(rw, bio); + return 0; +} + +void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, struct f2fs_io_info *fio) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); + struct f2fs_bio_info *io; + bool is_read = is_read_io(fio->rw); + + io = is_read ? &sbi->read_io : &sbi->write_io[btype]; + + verify_block_addr(sbi, blk_addr); + + down_write(&io->io_rwsem); + + if (!is_read) + inc_page_count(sbi, F2FS_WRITEBACK); + + if (io->bio && (io->last_block_in_bio != blk_addr - 1 || + io->fio.rw != fio->rw)) + __submit_merged_bio(io); +alloc_new: + if (io->bio == NULL) { + int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); + io->fio = *fio; + } + + if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + __submit_merged_bio(io); + goto alloc_new; + } + + io->last_block_in_bio = blk_addr; + + up_write(&io->io_rwsem); + trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -37,7 +221,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) struct page *node_page = dn->node_page; unsigned int ofs_in_node = dn->ofs_in_node; - f2fs_wait_on_page_writeback(node_page, NODE, false); + f2fs_wait_on_page_writeback(node_page, NODE); rn = F2FS_NODE(node_page); @@ -51,38 +235,57 @@ int reserve_new_block(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; - if (!inc_valid_block_count(sbi, dn->inode, 1)) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) return -ENOSPC; trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); __set_data_blkaddr(dn, NEW_ADDR); dn->data_blkaddr = NEW_ADDR; + mark_inode_dirty(dn->inode); sync_inode_page(dn); return 0; } +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) +{ + bool need_put = dn->inode_page ? false : true; + int err; + + /* if inode_page exists, index should be zero */ + f2fs_bug_on(!need_put && index); + + err = get_dnode_of_data(dn, index, ALLOC_NODE); + if (err) + return err; + + if (dn->data_blkaddr == NULL_ADDR) + err = reserve_new_block(dn); + if (err || need_put) + f2fs_put_dnode(dn); + return err; +} + static int check_extent_cache(struct inode *inode, pgoff_t pgofs, struct buffer_head *bh_result) { struct f2fs_inode_info *fi = F2FS_I(inode); -#ifdef CONFIG_F2FS_STAT_FS - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); -#endif pgoff_t start_fofs, end_fofs; block_t start_blkaddr; + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return 0; + read_lock(&fi->ext.ext_lock); if (fi->ext.len == 0) { read_unlock(&fi->ext.ext_lock); return 0; } -#ifdef CONFIG_F2FS_STAT_FS - sbi->total_hit_ext++; -#endif + stat_inc_total_hit(inode->i_sb); + start_fofs = fi->ext.fofs; end_fofs = fi->ext.fofs + fi->ext.len - 1; start_blkaddr = fi->ext.blk_addr; @@ -100,9 +303,7 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, else bh_result->b_size = UINT_MAX; -#ifdef CONFIG_F2FS_STAT_FS - sbi->read_hit_ext++; -#endif + stat_inc_read_hit(inode->i_sb); read_unlock(&fi->ext.ext_lock); return 1; } @@ -115,14 +316,18 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs, start_fofs, end_fofs; block_t start_blkaddr, end_blkaddr; + int need_update = true; - BUG_ON(blk_addr == NEW_ADDR); + f2fs_bug_on(blk_addr == NEW_ADDR); fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + dn->ofs_in_node; /* Update the page address in the parent node */ __set_data_blkaddr(dn, blk_addr); + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return; + write_lock(&fi->ext.ext_lock); start_fofs = fi->ext.fofs; @@ -169,14 +374,21 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) fofs - start_fofs + 1; fi->ext.len -= fofs - start_fofs + 1; } - goto end_update; + } else { + need_update = false; } - write_unlock(&fi->ext.ext_lock); - return; + /* Finally, if the extent is very fragmented, let's drop the cache. */ + if (fi->ext.len < F2FS_MIN_EXTENT_LEN) { + fi->ext.len = 0; + set_inode_flag(fi, FI_NO_EXTENT); + need_update = true; + } end_update: write_unlock(&fi->ext.ext_lock); - sync_inode_page(dn); + if (need_update) + sync_inode_page(dn); + return; } struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) @@ -202,10 +414,10 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) return ERR_PTR(-ENOENT); /* By fallocate(), there is no cached page, but with NEW_ADDR */ - if (dn.data_blkaddr == NEW_ADDR) + if (unlikely(dn.data_blkaddr == NEW_ADDR)) return ERR_PTR(-EINVAL); - page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); + page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); @@ -214,11 +426,14 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) return page; } - err = f2fs_readpage(sbi, page, dn.data_blkaddr, + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, sync ? READ_SYNC : READA); + if (err) + return ERR_PTR(err); + if (sync) { wait_on_page_locked(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 0); return ERR_PTR(-EIO); } @@ -240,7 +455,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) int err; repeat: - page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); + page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); @@ -252,7 +467,7 @@ repeat: } f2fs_put_dnode(&dn); - if (dn.data_blkaddr == NULL_ADDR) { + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { f2fs_put_page(page, 1); return ERR_PTR(-ENOENT); } @@ -272,16 +487,16 @@ repeat: return page; } - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) return ERR_PTR(err); lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -292,12 +507,12 @@ repeat: * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. * - * Also, caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). - * Note that, npage is set only by make_empty_dir. + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + * Note that, ipage is set only by make_empty_dir. */ struct page *get_new_data_page(struct inode *inode, - struct page *npage, pgoff_t index, bool new_i_size) + struct page *ipage, pgoff_t index, bool new_i_size) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -305,24 +520,16 @@ struct page *get_new_data_page(struct inode *inode, struct dnode_of_data dn; int err; - set_new_dnode(&dn, inode, npage, npage, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, index); if (err) return ERR_PTR(err); - - if (dn.data_blkaddr == NULL_ADDR) { - if (reserve_new_block(&dn)) { - if (!npage) - f2fs_put_dnode(&dn); - return ERR_PTR(-ENOSPC); - } - } - if (!npage) - f2fs_put_dnode(&dn); repeat: page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); + if (!page) { + err = -ENOMEM; + goto put_err; + } if (PageUptodate(page)) return page; @@ -331,15 +538,18 @@ repeat: zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); if (err) - return ERR_PTR(err); + goto put_err; + lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + err = -EIO; + goto put_err; } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -350,140 +560,206 @@ repeat: i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); /* Only the directory inode sets new_i_size */ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - mark_inode_dirty_sync(inode); } return page; -} - -static void read_end_io(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - bio_put(bio); +put_err: + f2fs_put_dnode(&dn); + return ERR_PTR(err); } -/* - * Fill the locked page with data located in the block address. - * Return unlocked page. - */ -int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, int type) +static int __allocate_data_block(struct dnode_of_data *dn) { - struct block_device *bdev = sbi->sb->s_bdev; - struct bio *bio; + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_summary sum; + block_t new_blkaddr; + struct node_info ni; + int type; - trace_f2fs_readpage(page, blk_addr, type); + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + return -EPERM; + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + return -ENOSPC; - down_read(&sbi->bio_sem); + __set_data_blkaddr(dn, NEW_ADDR); + dn->data_blkaddr = NEW_ADDR; - /* Allocate a new bio */ - bio = f2fs_bio_alloc(bdev, 1); + get_node_info(sbi, dn->nid, &ni); + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - /* Initialize the bio */ - bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - bio->bi_end_io = read_end_io; + type = CURSEG_WARM_DATA; - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - bio_put(bio); - up_read(&sbi->bio_sem); - f2fs_put_page(page, 1); - return -EFAULT; - } + allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type); + + /* direct IO doesn't use extent cache to maximize the performance */ + set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + update_extent_cache(new_blkaddr, dn); + clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); - submit_bio(type, bio); - up_read(&sbi->bio_sem); + dn->data_blkaddr = new_blkaddr; return 0; } /* - * This function should be used by the data read flow only where it - * does not check the "create" flag that indicates block allocation. - * The reason for this special functionality is to exploit VFS readahead - * mechanism. + * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh. + * If original data blocks are allocated, then give them to blockdev. + * Otherwise, + * a. preallocate requested block addresses + * b. do not use extent cache for better performance + * c. give the block addresses to blockdev */ -static int get_data_block_ro(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int __get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, bool fiemap) { + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; unsigned maxblocks = bh_result->b_size >> blkbits; struct dnode_of_data dn; - pgoff_t pgofs; - int err; + int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; + pgoff_t pgofs, end_offset; + int err = 0, ofs = 1; + bool allocated = false; /* Get the page offset from the block offset(iblock) */ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); - if (check_extent_cache(inode, pgofs, bh_result)) { - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); - return 0; - } + if (check_extent_cache(inode, pgofs, bh_result)) + goto out; + + if (create) + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = get_dnode_of_data(&dn, pgofs, mode); if (err) { - trace_f2fs_get_data_block(inode, iblock, bh_result, err); - return (err == -ENOENT) ? 0 : err; + if (err == -ENOENT) + err = 0; + goto unlock_out; } + if (dn.data_blkaddr == NEW_ADDR && !fiemap) + goto put_out; - /* It does not support data allocation */ - BUG_ON(create); + if (dn.data_blkaddr != NULL_ADDR) { + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else if (create) { + err = __allocate_data_block(&dn); + if (err) + goto put_out; + allocated = true; + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else { + goto put_out; + } - if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { - int i; - unsigned int end_offset; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + bh_result->b_size = (((size_t)1) << blkbits); + dn.ofs_in_node++; + pgofs++; - end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE(F2FS_I(inode)) : - ADDRS_PER_BLOCK; +get_next: + if (dn.ofs_in_node >= end_offset) { + if (allocated) + sync_inode_page(&dn); + allocated = false; + f2fs_put_dnode(&dn); - clear_buffer_new(bh_result); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, mode); + if (err) { + if (err == -ENOENT) + err = 0; + goto unlock_out; + } + if (dn.data_blkaddr == NEW_ADDR && !fiemap) + goto put_out; + + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + } + if (maxblocks > (bh_result->b_size >> blkbits)) { + block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (blkaddr == NULL_ADDR && create) { + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + blkaddr = dn.data_blkaddr; + } /* Give more consecutive addresses for the read ahead */ - for (i = 0; i < end_offset - dn.ofs_in_node; i++) - if (((datablock_addr(dn.node_page, - dn.ofs_in_node + i)) - != (dn.data_blkaddr + i)) || maxblocks == i) - break; - map_bh(bh_result, inode->i_sb, dn.data_blkaddr); - bh_result->b_size = (i << blkbits); + if (blkaddr == (bh_result->b_blocknr + ofs)) { + ofs++; + dn.ofs_in_node++; + pgofs++; + bh_result->b_size += (((size_t)1) << blkbits); + goto get_next; + } } +sync_out: + if (allocated) + sync_inode_page(&dn); +put_out: f2fs_put_dnode(&dn); - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); - return 0; +unlock_out: + if (create) + f2fs_unlock_op(sbi); +out: + trace_f2fs_get_data_block(inode, iblock, bh_result, err); + return err; +} + +static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, false); +} + +static int get_data_block_fiemap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, true); +} + +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, + start, len, get_data_block_fiemap); } static int f2fs_read_data_page(struct file *file, struct page *page) { - return mpage_readpage(page, get_data_block_ro); + struct inode *inode = page->mapping->host; + int ret; + + trace_f2fs_readpage(page, DATA); + + /* If the file has inline data, try to read it directlly */ + if (f2fs_has_inline_data(inode)) + ret = f2fs_read_inline_data(inode, page); + else + ret = mpage_readpage(page, get_data_block); + + return ret; } static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); + struct inode *inode = file->f_mapping->host; + + /* If the file has inline data, skip readpages */ + if (f2fs_has_inline_data(inode)) + return 0; + + return mpage_readpages(mapping, pages, nr_pages, get_data_block); } -int do_write_data_page(struct page *page) +int do_write_data_page(struct page *page, struct f2fs_io_info *fio) { struct inode *inode = page->mapping->host; - block_t old_blk_addr, new_blk_addr; + block_t old_blkaddr, new_blkaddr; struct dnode_of_data dn; int err = 0; @@ -492,10 +768,10 @@ int do_write_data_page(struct page *page) if (err) return err; - old_blk_addr = dn.data_blkaddr; + old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ - if (old_blk_addr == NULL_ADDR) + if (old_blkaddr == NULL_ADDR) goto out_writepage; set_page_writeback(page); @@ -504,15 +780,13 @@ int do_write_data_page(struct page *page) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(old_blk_addr != NEW_ADDR && + if (unlikely(old_blkaddr != NEW_ADDR && !is_cold_data(page) && need_inplace_update(inode))) { - rewrite_data_page(F2FS_SB(inode->i_sb), page, - old_blk_addr); + rewrite_data_page(page, old_blkaddr, fio); } else { - write_data_page(inode, page, &dn, - old_blk_addr, &new_blk_addr); - update_extent_cache(new_blk_addr, &dn); + write_data_page(page, &dn, &new_blkaddr, fio); + update_extent_cache(new_blkaddr, &dn); } out_writepage: f2fs_put_dnode(&dn); @@ -527,9 +801,15 @@ static int f2fs_write_data_page(struct page *page, loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_CACHE_SHIFT; - unsigned offset; + unsigned offset = 0; bool need_balance_fs = false; int err = 0; + struct f2fs_io_info fio = { + .type = DATA, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + }; + + trace_f2fs_writepage(page, DATA); if (page->index < end_index) goto write; @@ -539,55 +819,50 @@ static int f2fs_write_data_page(struct page *page, * this page does not have to be written to disk. */ offset = i_size & (PAGE_CACHE_SIZE - 1); - if ((page->index >= end_index + 1) || !offset) { - if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); - } + if ((page->index >= end_index + 1) || !offset) goto out; - } zero_user_segment(page, offset, PAGE_CACHE_SIZE); write: - if (sbi->por_doing) { - err = AOP_WRITEPAGE_ACTIVATE; + if (unlikely(sbi->por_doing)) goto redirty_out; - } /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); - err = do_write_data_page(page); - } else { - int ilock = mutex_lock_op(sbi); - err = do_write_data_page(page); - mutex_unlock_op(sbi, ilock); - need_balance_fs = true; + err = do_write_data_page(page, &fio); + goto done; } - if (err == -ENOENT) - goto out; - else if (err) + + if (!wbc->for_reclaim) + need_balance_fs = true; + else if (has_not_enough_free_secs(sbi, 0)) goto redirty_out; - if (wbc->for_reclaim) - f2fs_submit_bio(sbi, DATA, true); + f2fs_lock_op(sbi); + if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) + err = f2fs_write_inline_data(inode, page, offset); + else + err = do_write_data_page(page, &fio); + f2fs_unlock_op(sbi); +done: + if (err && err != -ENOENT) + goto redirty_out; clear_cold_data(page); out: + inode_dec_dirty_dents(inode); unlock_page(page); if (need_balance_fs) f2fs_balance_fs(sbi); + if (wbc->for_reclaim) + f2fs_submit_merged_bio(sbi, DATA, WRITE); return 0; redirty_out: - wbc->pages_skipped++; - set_page_dirty(page); - return err; + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; } -#define MAX_DESIRED_PAGES_WP 4096 - static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -604,17 +879,20 @@ static int f2fs_write_data_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); bool locked = false; int ret; - long excess_nrtw = 0, desired_nrtw; + long diff; + + trace_f2fs_writepages(mapping->host, wbc, DATA); /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; - if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { - desired_nrtw = MAX_DESIRED_PAGES_WP; - excess_nrtw = desired_nrtw - wbc->nr_to_write; - wbc->nr_to_write = desired_nrtw; - } + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) && + available_free_memory(sbi, DIRTY_DENTS)) + goto skip_write; + + diff = nr_pages_to_write(sbi, DATA, wbc); if (!S_ISDIR(inode->i_mode)) { mutex_lock(&sbi->writepages); @@ -623,12 +901,17 @@ static int f2fs_write_data_pages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); if (locked) mutex_unlock(&sbi->writepages); - f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); + + f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_dir_inode(inode); - wbc->nr_to_write -= excess_nrtw; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; + +skip_write: + wbc->pages_skipped += get_dirty_dents(inode); + return 0; } static int f2fs_write_begin(struct file *file, struct address_space *mapping, @@ -641,30 +924,44 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; struct dnode_of_data dn; int err = 0; - int ilock; + + trace_f2fs_write_begin(inode, pos, len, flags); f2fs_balance_fs(sbi); repeat: + err = f2fs_convert_inline_data(inode, pos + len); + if (err) + return err; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; + + /* to avoid latency during memory pressure */ + unlock_page(page); + *pagep = page; - ilock = mutex_lock_op(sbi); + if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA) + goto inline_data; + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (err) - goto err; + err = f2fs_reserve_block(&dn, index); + f2fs_unlock_op(sbi); - if (dn.data_blkaddr == NULL_ADDR) - err = reserve_new_block(&dn); - - f2fs_put_dnode(&dn); - if (err) - goto err; + if (err) { + f2fs_put_page(page, 0); + return err; + } +inline_data: + lock_page(page); + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } - mutex_unlock_op(sbi, ilock); + f2fs_wait_on_page_writeback(page, DATA); if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@ -681,15 +978,25 @@ repeat: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) - return err; + if (f2fs_has_inline_data(inode)) { + err = f2fs_read_inline_data(inode, page); + if (err) { + page_cache_release(page); + return err; + } + } else { + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); + if (err) + return err; + } + lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return -EIO; } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -698,11 +1005,6 @@ out: SetPageUptodate(page); clear_cold_data(page); return 0; - -err: - mutex_unlock_op(sbi, ilock); - f2fs_put_page(page, 1); - return err; } static int f2fs_write_end(struct file *file, @@ -712,6 +1014,8 @@ static int f2fs_write_end(struct file *file, { struct inode *inode = page->mapping->host; + trace_f2fs_write_end(inode, pos, len, copied); + SetPageUptodate(page); set_page_dirty(page); @@ -721,34 +1025,53 @@ static int f2fs_write_end(struct file *file, update_inode_page(inode); } - unlock_page(page); - page_cache_release(page); + f2fs_put_page(page, 1); return copied; } +static int check_direct_IO(struct inode *inode, int rw, + struct iov_iter *iter, loff_t offset) +{ + unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; + + if (rw == READ) + return 0; + + if (offset & blocksize_mask) + return -EINVAL; + + if (iov_iter_alignment(iter) & blocksize_mask) + return -EINVAL; + + return 0; +} + static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - if (rw == WRITE) + /* Let buffer I/O handle the inline data case. */ + if (f2fs_has_inline_data(inode)) + return 0; + + if (check_direct_IO(inode, rw, iter, offset)) return 0; - /* Needs synchronization with the cleaner */ - return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - get_data_block_ro); + /* clear fsync mark to recover these blocks */ + fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); + + return blockdev_direct_IO(rw, iocb, inode, iter, offset, + get_data_block); } static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, unsigned int length) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - if (S_ISDIR(inode->i_mode) && PageDirty(page)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + if (PageDirty(page)) inode_dec_dirty_dents(inode); - } ClearPagePrivate(page); } @@ -763,7 +1086,11 @@ static int f2fs_set_data_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; + trace_f2fs_set_page_dirty(page, DATA); + SetPageUptodate(page); + mark_inode_dirty(inode); + if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); set_dirty_dir_page(inode, page); @@ -774,7 +1101,12 @@ static int f2fs_set_data_page_dirty(struct page *page) static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { - return generic_block_bmap(mapping, block, get_data_block_ro); + struct inode *inode = mapping->host; + + if (f2fs_has_inline_data(inode)) + return 0; + + return generic_block_bmap(mapping, block, get_data_block); } const struct address_space_operations f2fs_dblock_aops = { diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a84b0a8e685..b52c12cf587 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -24,7 +24,7 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static struct dentry *debugfs_root; +static struct dentry *f2fs_debugfs_root; static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) @@ -45,14 +45,15 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_count = valid_user_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); + si->inline_inode = sbi->inline_inode; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); si->free_secs = free_sections(sbi); si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); - si->node_pages = sbi->node_inode->i_mapping->nrpages; - si->meta_pages = sbi->meta_inode->i_mapping->nrpages; + si->node_pages = NODE_MAPPING(sbi)->nrpages; + si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; si->sits = SIT_I(sbi)->dirty_sentries; si->fnids = NM_I(sbi)->fcnt; @@ -85,7 +86,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; - struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, vblocks; int ndirty = 0; @@ -93,7 +93,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) total_vblocks = 0; blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); hblks_per_sec = blks_per_sec / 2; - mutex_lock(&sit_i->sentry_lock); for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); dist = abs(vblocks - hblks_per_sec); @@ -104,7 +103,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) ndirty++; } } - mutex_unlock(&sit_i->sentry_lock); dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; si->bimodal = bimodal / dist; if (si->dirty_count) @@ -165,9 +163,9 @@ get_cache: /* free nids */ si->cache_mem = NM_I(sbi)->fcnt; si->cache_mem += NM_I(sbi)->nat_cnt; - npages = sbi->node_inode->i_mapping->nrpages; + npages = NODE_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; - npages = sbi->meta_inode->i_mapping->nrpages; + npages = META_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); @@ -200,6 +198,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "Other: %u)\n - Data: %u\n", si->valid_node_count - si->valid_inode_count, si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_data Inode: %u\n", + si->inline_inode); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -233,6 +233,7 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); + seq_printf(s, "CP calls: %d\n", si->cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d\n", si->data_segs); @@ -242,17 +243,17 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - node blocks : %d\n", si->node_blks); seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", si->hit_ext, si->total_ext); - seq_printf(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - nodes %4d in %4d\n", + seq_puts(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents %4d in dirs:%4d\n", + seq_printf(s, " - dents: %4d in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - meta %4d in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - NATs %5d > %lu\n", - si->nats, NM_WOUT_THRESHOLD); - seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", - si->sits, si->fnids); + seq_printf(s, " - NATs: %9d\n - SITs: %9d\n", + si->nats, si->sits); + seq_printf(s, " - free_nids: %9d\n", + si->fnids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -340,14 +341,32 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) void __init f2fs_create_root_stats(void) { - debugfs_root = debugfs_create_dir("f2fs", NULL); - if (debugfs_root) - debugfs_create_file("status", S_IRUGO, debugfs_root, - NULL, &stat_fops); + struct dentry *file; + + f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); + if (!f2fs_debugfs_root) + goto bail; + + file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, + NULL, &stat_fops); + if (!file) + goto free_debugfs_dir; + + return; + +free_debugfs_dir: + debugfs_remove(f2fs_debugfs_root); + +bail: + f2fs_debugfs_root = NULL; + return; } void f2fs_destroy_root_stats(void) { - debugfs_remove_recursive(debugfs_root); - debugfs_root = NULL; + if (!f2fs_debugfs_root) + return; + + debugfs_remove_recursive(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 384c6daf9a8..a4addd72ebb 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -21,12 +21,12 @@ static unsigned long dir_blocks(struct inode *inode) >> PAGE_CACHE_SHIFT; } -static unsigned int dir_buckets(unsigned int level) +static unsigned int dir_buckets(unsigned int level, int dir_level) { - if (level < MAX_DIR_HASH_DEPTH / 2) - return 1 << level; + if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) + return 1 << (level + dir_level); else - return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1); + return MAX_DIR_BUCKETS; } static unsigned int bucket_blocks(unsigned int level) @@ -65,13 +65,14 @@ static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } -static unsigned long dir_block_index(unsigned int level, unsigned int idx) +static unsigned long dir_block_index(unsigned int level, + int dir_level, unsigned int idx) { unsigned long i; unsigned long bidx = 0; for (i = 0; i < level; i++) - bidx += dir_buckets(i) * bucket_blocks(i); + bidx += dir_buckets(i, dir_level) * bucket_blocks(i); bidx += idx * bucket_blocks(level); return bidx; } @@ -93,16 +94,21 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, f2fs_hash_t namehash, struct page **res_page) { struct f2fs_dir_entry *de; - unsigned long bit_pos, end_pos, next_pos; + unsigned long bit_pos = 0; struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); - int slots; + const void *dentry_bits = &dentry_blk->dentry_bitmap; + int max_len = 0; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, 0); while (bit_pos < NR_DENTRY_IN_BLOCK) { + if (!test_bit_le(bit_pos, dentry_bits)) { + if (bit_pos == 0) + max_len = 1; + else if (!test_bit_le(bit_pos - 1, dentry_bits)) + max_len++; + bit_pos++; + continue; + } de = &dentry_blk->dentry[bit_pos]; - slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - if (early_match_name(name, namelen, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], name, namelen)) { @@ -110,20 +116,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, goto found; } } - next_pos = bit_pos + slots; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, next_pos); - if (bit_pos >= NR_DENTRY_IN_BLOCK) - end_pos = NR_DENTRY_IN_BLOCK; - else - end_pos = bit_pos; - if (*max_slots < end_pos - next_pos) - *max_slots = end_pos - next_pos; + if (max_len > *max_slots) { + *max_slots = max_len; + max_len = 0; + } + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } de = NULL; kunmap(dentry_page); found: + if (max_len > *max_slots) + *max_slots = max_len; return de; } @@ -139,12 +143,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, bool room = false; int max_slots = 0; - BUG_ON(level > MAX_DIR_HASH_DEPTH); + f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + le32_to_cpu(namehash) % nbucket); end_block = bidx + nblock; for (; bidx < end_block; bidx++) { @@ -190,9 +195,6 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; - if (namelen > F2FS_NAME_LEN) - return NULL; - if (npages == 0) return NULL; @@ -251,7 +253,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); kunmap(page); @@ -259,20 +261,19 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, dir->i_mtime = dir->i_ctime = CURRENT_TIME; mark_inode_dirty(dir); - /* update parent inode number before releasing dentry page */ - F2FS_I(inode)->i_pino = dir->i_ino; - f2fs_put_page(page, 1); } static void init_dent_inode(const struct qstr *name, struct page *ipage) { - struct f2fs_node *rn; + struct f2fs_inode *ri; + + f2fs_wait_on_page_writeback(ipage, NODE); /* copy name info. to this inode page */ - rn = F2FS_NODE(ipage); - rn->i.i_namelen = cpu_to_le32(name->len); - memcpy(rn->i.i_name, name->name, name->len); + ri = F2FS_INODE(ipage); + ri->i_namelen = cpu_to_le32(name->len); + memcpy(ri->i_name, name->name, name->len); set_page_dirty(ipage); } @@ -346,21 +347,18 @@ static struct page *init_inode_metadata(struct inode *inode, goto error; } - err = f2fs_init_acl(inode, dir); + err = f2fs_init_acl(inode, dir, page); if (err) - goto error; + goto put_error; err = f2fs_init_security(inode, dir, name, page); if (err) - goto error; - - wait_on_page_writeback(page); + goto put_error; } else { page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); if (IS_ERR(page)) return page; - wait_on_page_writeback(page); set_cold_node(inode, page); } @@ -376,8 +374,13 @@ static struct page *init_inode_metadata(struct inode *inode, } return page; -error: +put_error: f2fs_put_page(page, 1); +error: + /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ + truncate_inode_pages(&inode->i_data, 0); + truncate_blocks(inode, 0); + remove_dirty_dir_inode(inode); remove_inode_page(inode); return ERR_PTR(err); } @@ -393,16 +396,13 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + if (F2FS_I(dir)->i_current_depth != current_depth) { F2FS_I(dir)->i_current_depth = current_depth; set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) - update_inode_page(dir); - else - mark_inode_dirty(dir); - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) clear_inode_flag(F2FS_I(inode), FI_INC_LINK); } @@ -432,10 +432,11 @@ next: } /* - * Caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode) { unsigned int bit_pos; unsigned int level; @@ -461,17 +462,18 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in } start: - if (current_depth == MAX_DIR_HASH_DEPTH) + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) ++current_depth; - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); @@ -491,8 +493,9 @@ start: ++level; goto start; add_dentry: - wait_on_page_writeback(dentry_page); + f2fs_wait_on_page_writeback(dentry_page, DATA); + down_write(&F2FS_I(inode)->i_sem); page = init_inode_metadata(inode, dir, name); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -515,7 +518,12 @@ add_dentry: update_parent_metadata(dir, inode, current_depth); fail: - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + up_write(&F2FS_I(inode)->i_sem); + + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { + update_inode_page(dir); + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); return err; @@ -532,13 +540,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, unsigned int bit_pos; struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); void *kaddr = page_address(page); int i; lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); dentry_blk = (struct f2fs_dentry_block *)kaddr; bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; @@ -554,20 +561,22 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; - if (inode && S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - update_inode_page(dir); - } else { - mark_inode_dirty(dir); - } - if (inode) { + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + + down_write(&F2FS_I(inode)->i_sem); + + if (S_ISDIR(inode->i_mode)) { + drop_nlink(dir); + update_inode_page(dir); + } inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { drop_nlink(inode); i_size_write(inode, 0); } + up_write(&F2FS_I(inode)->i_sem); update_inode_page(inode); if (inode->i_nlink == 0) @@ -580,7 +589,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(dir); } f2fs_put_page(page, 1); @@ -631,12 +639,18 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dir_entry *de = NULL; struct page *dentry_page = NULL; + struct file_ra_state *ra = &file->f_ra; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); unsigned char d_type = DT_UNKNOWN; bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); - for ( ; n < npages; n++) { + /* readahead for multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, + min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); + + for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n); if (IS_ERR(dentry_page)) continue; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 608f0df5b91..58df97e174d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -18,6 +18,15 @@ #include <linux/crc32.h> #include <linux/magic.h> #include <linux/kobject.h> +#include <linux/sched.h> + +#ifdef CONFIG_F2FS_CHECK_FS +#define f2fs_bug_on(condition) BUG_ON(condition) +#define f2fs_down_write(x, y) down_write_nest_lock(x, y) +#else +#define f2fs_bug_on(condition) +#define f2fs_down_write(x, y) down_write(x) +#endif /* * For mount options @@ -30,6 +39,8 @@ #define F2FS_MOUNT_POSIX_ACL 0x00000020 #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 #define F2FS_MOUNT_INLINE_XATTR 0x00000080 +#define F2FS_MOUNT_INLINE_DATA 0x00000100 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -78,6 +89,16 @@ enum { SIT_BITMAP }; +/* + * For CP/NAT/SIT/SSA readahead + */ +enum { + META_CP, + META_NAT, + META_SIT, + META_SSA +}; + /* for the list of orphan inodes */ struct orphan_inode_entry { struct list_head list; /* list head */ @@ -90,6 +111,13 @@ struct dir_inode_entry { struct inode *inode; /* vfs inode pointer */ }; +/* for the list of blockaddresses to be discarded */ +struct discard_entry { + struct list_head list; /* list head */ + block_t blkaddr; /* block address to be discarded */ + int len; /* # of consecutive blocks of the discard */ +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ @@ -148,13 +176,17 @@ enum { LOOKUP_NODE, /* look up a node without readahead */ LOOKUP_NODE_RA, /* * look up a node with readahead called - * by get_datablock_ro. + * by get_data_block. */ }; #define F2FS_LINK_MAX 32000 /* maximum link count per file */ +#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ + /* for in-memory extent cache entry */ +#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */ + struct extent_info { rwlock_t ext_lock; /* rwlock for consistency */ unsigned int fofs; /* start offset in a file */ @@ -168,22 +200,27 @@ struct extent_info { #define FADVISE_COLD_BIT 0x01 #define FADVISE_LOST_PINO_BIT 0x02 +#define DEF_DIR_LEVEL 0 + struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ + unsigned char i_dir_level; /* use for dentry level for large dir */ unsigned int i_current_depth; /* use only in directory structure */ unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ + struct rw_semaphore i_sem; /* protect fi info */ atomic_t dirty_dents; /* # of dirty dentry pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ struct extent_info ext; /* in-memory extent cache entry */ + struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */ }; static inline void get_extent_info(struct extent_info *ext, @@ -209,7 +246,9 @@ static inline void set_raw_extent(struct extent_info *ext, struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ + nid_t available_nids; /* maximum available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ + unsigned int ram_thresh; /* control the memory footprint */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ @@ -219,6 +258,7 @@ struct f2fs_nm_info { struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ /* free node ids management */ + struct radix_tree_root free_nid_root;/* root of the free_nid cache */ struct list_head free_nid_list; /* a list for free nids */ spinlock_t free_nid_list_lock; /* protect free nid list */ unsigned int fcnt; /* the number of free node id */ @@ -281,15 +321,27 @@ enum { NO_CHECK_TYPE }; +struct flush_cmd { + struct flush_cmd *next; + struct completion wait; + int ret; +}; + +struct flush_cmd_control { + struct task_struct *f2fs_issue_flush; /* flush thread */ + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + struct flush_cmd *issue_list; /* list for command issue */ + struct flush_cmd *dispatch_list; /* list for command dispatch */ + spinlock_t issue_lock; /* for issue list lock */ + struct flush_cmd *issue_tail; /* list tail of issue list */ +}; + struct f2fs_sm_info { struct sit_info *sit_info; /* whole segment information */ struct free_segmap_info *free_info; /* free segment information */ struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct list_head wblist_head; /* list of under-writeback pages */ - spinlock_t wblist_lock; /* lock for checkpoint */ - block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ @@ -298,6 +350,21 @@ struct f2fs_sm_info { unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ unsigned int ovp_segments; /* # of overprovision segments */ + + /* a threshold to reclaim prefree segments */ + unsigned int rec_prefree_segments; + + /* for small discard management */ + struct list_head discard_list; /* 4KB discard list */ + int nr_discards; /* # of discards in the list */ + int max_discards; /* max. discards to be issued */ + + unsigned int ipu_policy; /* in-place-update policy */ + unsigned int min_ipu_util; /* in-place-update threshold */ + + /* for flush command control */ + struct flush_cmd_control *cmd_control_info; + }; /* @@ -318,14 +385,6 @@ enum count_type { }; /* - * Uses as sbi->fs_lock[NR_GLOBAL_LOCKS]. - * The checkpoint procedure blocks all the locks in this fs_lock array. - * Some FS operations grab free locks, and if there is no free lock, - * then wait to grab a lock in a round-robin manner. - */ -#define NR_GLOBAL_LOCKS 8 - -/* * The below are the page types of bios used in submti_bio(). * The available types are: * DATA User data pages. It operates as async mode. @@ -336,6 +395,7 @@ enum count_type { * with waiting the bio's completion * ... Only can be used with META. */ +#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) enum page_type { DATA, NODE, @@ -344,6 +404,20 @@ enum page_type { META_FLUSH, }; +struct f2fs_io_info { + enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ +}; + +#define is_read_io(rw) (((rw) & 1) == READ) +struct f2fs_bio_info { + struct f2fs_sb_info *sbi; /* f2fs superblock */ + struct bio *bio; /* bios to merge */ + sector_t last_block_in_bio; /* last block number */ + struct f2fs_io_info fio; /* store buffered io info. */ + struct rw_semaphore io_rwsem; /* blocking op for bio */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -357,25 +431,27 @@ struct f2fs_sb_info { /* for segment-related operations */ struct f2fs_sm_info *sm_info; /* segment manager */ - struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ - sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ - struct rw_semaphore bio_sem; /* IO semaphore */ + + /* for bio operations */ + struct f2fs_bio_info read_io; /* for read bios */ + struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct completion *wait_io; /* for completion bios */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ - struct mutex fs_lock[NR_GLOBAL_LOCKS]; /* blocking FS operations */ + struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct mutex node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ - unsigned char next_lock_num; /* round-robin global locks */ - int por_doing; /* recovery is doing or not */ - int on_build_free_nids; /* build_free_nids is doing */ + bool por_doing; /* recovery is doing or not */ + wait_queue_head_t cp_wait; /* for orphan inode management */ struct list_head orphan_inode_list; /* orphan inode list */ - struct mutex orphan_inode_mutex; /* for orphan inode list */ + spinlock_t orphan_inode_lock; /* for orphan inode list */ unsigned int n_orphans; /* # of orphan inodes */ + unsigned int max_orphans; /* max orphan inodes */ /* for directory inode management */ struct list_head dir_inode_list; /* dir inode list */ @@ -397,6 +473,7 @@ struct f2fs_sb_info { unsigned int total_valid_node_count; /* valid node block count */ unsigned int total_valid_inode_count; /* valid inode count */ int active_logs; /* # of active logs */ + int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -412,6 +489,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* maximum # of trials to find a victim segment for SSR and GC */ + unsigned int max_victim_search; + /* * for stat information. * one is for the LFS mode, and the other is for the SSR mode. @@ -421,6 +501,7 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ + int inline_inode; /* # of inline_data inodes */ int bg_gc; /* background gc calls */ unsigned int n_dirty_dirs; /* # of dir inodes */ #endif @@ -460,6 +541,11 @@ static inline struct f2fs_node *F2FS_NODE(struct page *page) return (struct f2fs_node *)page_address(page); } +static inline struct f2fs_inode *F2FS_INODE(struct page *page) +{ + return &((struct f2fs_node *)page_address(page))->i; +} + static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_nm_info *)(sbi->nm_info); @@ -485,6 +571,16 @@ static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); } +static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->meta_inode->i_mapping; +} + +static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->node_inode->i_mapping; +} + static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) { sbi->s_dirty = 1; @@ -520,48 +616,24 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void mutex_lock_all(struct f2fs_sb_info *sbi) +static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - int i; - - for (i = 0; i < NR_GLOBAL_LOCKS; i++) { - /* - * This is the only time we take multiple fs_lock[] - * instances; the order is immaterial since we - * always hold cp_mutex, which serializes multiple - * such operations. - */ - mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex); - } + down_read(&sbi->cp_rwsem); } -static inline void mutex_unlock_all(struct f2fs_sb_info *sbi) +static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - int i = 0; - for (; i < NR_GLOBAL_LOCKS; i++) - mutex_unlock(&sbi->fs_lock[i]); + up_read(&sbi->cp_rwsem); } -static inline int mutex_lock_op(struct f2fs_sb_info *sbi) +static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - unsigned char next_lock = sbi->next_lock_num % NR_GLOBAL_LOCKS; - int i = 0; - - for (; i < NR_GLOBAL_LOCKS; i++) - if (mutex_trylock(&sbi->fs_lock[i])) - return i; - - mutex_lock(&sbi->fs_lock[next_lock]); - sbi->next_lock_num++; - return next_lock; + f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); } -static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock) +static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { - if (ilock < 0) - return; - BUG_ON(ilock >= NR_GLOBAL_LOCKS); - mutex_unlock(&sbi->fs_lock[ilock]); + up_write(&sbi->cp_rwsem); } /* @@ -569,8 +641,9 @@ static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock) */ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - WARN_ON((nid >= NM_I(sbi)->max_nid)); - if (nid >= NM_I(sbi)->max_nid) + if (unlikely(nid < F2FS_ROOT_INO(sbi))) + return -EINVAL; + if (unlikely(nid >= NM_I(sbi)->max_nid)) return -EINVAL; return 0; } @@ -583,9 +656,14 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) static inline int F2FS_HAS_BLOCKS(struct inode *inode) { if (F2FS_I(inode)->i_xattr_nid) - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; else - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; +} + +static inline bool f2fs_has_xattr_block(unsigned int ofs) +{ + return ofs == XATTR_NODE_OFFSET; } static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, @@ -596,7 +674,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + (block_t)count; - if (valid_block_count > sbi->user_block_count) { + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } @@ -607,17 +685,16 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, return true; } -static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, +static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t count) { spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_block_count < (block_t) count); - BUG_ON(inode->i_blocks < count); + f2fs_bug_on(sbi->total_valid_block_count < (block_t) count); + f2fs_bug_on(inode->i_blocks < count); inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - return 0; } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -628,6 +705,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_dents(struct inode *inode) { + inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_inc(&F2FS_I(inode)->dirty_dents); } @@ -638,6 +716,10 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_dec_dirty_dents(struct inode *inode) { + if (!S_ISDIR(inode->i_mode)) + return; + + dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_dec(&F2FS_I(inode)->dirty_dents); } @@ -646,6 +728,11 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } +static inline int get_dirty_dents(struct inode *inode) +{ + return atomic_read(&F2FS_I(inode)->dirty_dents); +} + static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { unsigned int pages_per_sec = sbi->segs_per_sec * @@ -656,11 +743,7 @@ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) { - block_t ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_block_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_block_count; } static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) @@ -679,9 +762,18 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - int offset = (flag == NAT_BITMAP) ? + int offset; + + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload) > 0) { + if (flag == NAT_BITMAP) + return &ckpt->sit_nat_version_bitmap; + else + return ((unsigned char *)ckpt + F2FS_BLKSIZE); + } else { + offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; - return &ckpt->sit_nat_version_bitmap + offset; + return &ckpt->sit_nat_version_bitmap + offset; + } } static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) @@ -708,96 +800,85 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) } static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) + struct inode *inode) { block_t valid_block_count; unsigned int valid_node_count; spin_lock(&sbi->stat_lock); - valid_block_count = sbi->total_valid_block_count + (block_t)count; - sbi->alloc_valid_block_count += (block_t)count; - valid_node_count = sbi->total_valid_node_count + count; - - if (valid_block_count > sbi->user_block_count) { + valid_block_count = sbi->total_valid_block_count + 1; + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } - if (valid_node_count > sbi->total_node_count) { + valid_node_count = sbi->total_valid_node_count + 1; + if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); return false; } if (inode) - inode->i_blocks += count; - sbi->total_valid_node_count = valid_node_count; - sbi->total_valid_block_count = valid_block_count; + inode->i_blocks++; + + sbi->alloc_valid_block_count++; + sbi->total_valid_node_count++; + sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); return true; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) + struct inode *inode) { spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_block_count < count); - BUG_ON(sbi->total_valid_node_count < count); - BUG_ON(inode->i_blocks < count); + f2fs_bug_on(!sbi->total_valid_block_count); + f2fs_bug_on(!sbi->total_valid_node_count); + f2fs_bug_on(!inode->i_blocks); - inode->i_blocks -= count; - sbi->total_valid_node_count -= count; - sbi->total_valid_block_count -= (block_t)count; + inode->i_blocks--; + sbi->total_valid_node_count--; + sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_node_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_node_count; } static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count); + f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count); sbi->total_valid_inode_count++; spin_unlock(&sbi->stat_lock); } -static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) +static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); - BUG_ON(!sbi->total_valid_inode_count); + f2fs_bug_on(!sbi->total_valid_inode_count); sbi->total_valid_inode_count--; spin_unlock(&sbi->stat_lock); - return 0; } static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_inode_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_inode_count; } static inline void f2fs_put_page(struct page *page, int unlock) { - if (!page || IS_ERR(page)) + if (!page) return; if (unlock) { - BUG_ON(!PageLocked(page)); + f2fs_bug_on(!PageLocked(page)); unlock_page(page); } page_cache_release(page); @@ -814,9 +895,23 @@ static inline void f2fs_put_dnode(struct dnode_of_data *dn) } static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, - size_t size, void (*ctor)(void *)) + size_t size) { - return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor); + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); +} + +static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags) +{ + void *entry; +retry: + entry = kmem_cache_alloc(cachep, flags); + if (!entry) { + cond_resched(); + goto retry; + } + + return entry; } #define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) @@ -879,12 +974,15 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr) enum { FI_NEW_INODE, /* indicate newly allocated inode */ FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ FI_UPDATE_DIR, /* should update inode block for consistency */ FI_DELAY_IPUT, /* used for the recovery */ + FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -922,6 +1020,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi, { if (ri->i_inline & F2FS_INLINE_XATTR) set_inode_flag(fi, FI_INLINE_XATTR); + if (ri->i_inline & F2FS_INLINE_DATA) + set_inode_flag(fi, FI_INLINE_DATA); } static inline void set_raw_inline(struct f2fs_inode_info *fi, @@ -931,41 +1031,75 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi, if (is_inode_flag_set(fi, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; + if (is_inode_flag_set(fi, FI_INLINE_DATA)) + ri->i_inline |= F2FS_INLINE_DATA; +} + +static inline int f2fs_has_inline_xattr(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); } static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) { - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (f2fs_has_inline_xattr(&fi->vfs_inode)) return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; return DEF_ADDRS_PER_INODE; } static inline void *inline_xattr_addr(struct page *page) { - struct f2fs_inode *ri; - ri = (struct f2fs_inode *)page_address(page); + struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } static inline int inline_xattr_size(struct inode *inode) { - if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) + if (f2fs_has_inline_xattr(inode)) return F2FS_INLINE_XATTR_ADDRS << 2; else return 0; } +static inline int f2fs_has_inline_data(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); +} + +static inline void *inline_data_addr(struct page *page) +{ + struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[1]); +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; } +static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) +{ + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; +} + +#define get_inode_mode(i) \ + ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) + +/* get offset of first page in next direct node */ +#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \ + ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \ + (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \ + ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi)) + /* * file.c */ int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); +int truncate_blocks(struct inode *, u64); void f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); @@ -979,8 +1113,9 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); */ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); +int try_to_free_nats(struct f2fs_sb_info *, int); void update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); +void update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); @@ -1028,12 +1163,16 @@ f2fs_hash_t f2fs_dentry_hash(const char *, size_t); struct dnode_of_data; struct node_info; +bool available_free_memory(struct f2fs_sb_info *, int); int is_checkpointed_node(struct f2fs_sb_info *, nid_t); +bool fsync_mark_done(struct f2fs_sb_info *, nid_t); +void fsync_mark_clear(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); -int remove_inode_page(struct inode *); +int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); +void remove_inode_page(struct inode *); struct page *new_inode_page(struct inode *, const struct qstr *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); @@ -1046,6 +1185,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); void recover_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, struct node_info *, block_t); +bool recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); int restore_node_summary(struct f2fs_sb_info *, unsigned int, struct f2fs_summary_block *); @@ -1059,24 +1199,30 @@ void destroy_node_manager_caches(void); * segment.c */ void f2fs_balance_fs(struct f2fs_sb_info *); +void f2fs_balance_fs_bg(struct f2fs_sb_info *); +int f2fs_issue_flush(struct f2fs_sb_info *); +int create_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); +void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); +void discard_next_dnode(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -struct bio *f2fs_bio_alloc(struct block_device *, int); -void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, - block_t, block_t *); -void write_data_page(struct inode *, struct page *, struct dnode_of_data*, - block_t, block_t *); -void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); +void write_node_page(struct f2fs_sb_info *, struct page *, + struct f2fs_io_info *, unsigned int, block_t, block_t *); +void write_data_page(struct page *, struct dnode_of_data *, block_t *, + struct f2fs_io_info *); +void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); void recover_data_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); void rewrite_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); +void allocate_data_block(struct f2fs_sb_info *, struct page *, + block_t, block_t *, struct f2fs_summary *, int); +void f2fs_wait_on_page_writeback(struct page *, enum page_type); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, @@ -1084,23 +1230,25 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *, void flush_sit_entries(struct f2fs_sb_info *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); +int __init create_segment_manager_caches(void); +void destroy_segment_manager_caches(void); /* * checkpoint.c */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +int ra_meta_pages(struct f2fs_sb_info *, int, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); +void recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void set_dirty_dir_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); void remove_dirty_dir_inode(struct inode *); -struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t); void sync_dirty_dir_inodes(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool); void init_orphan_info(struct f2fs_sb_info *); @@ -1110,13 +1258,18 @@ void destroy_checkpoint_caches(void); /* * data.c */ +void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); +int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int); +void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t, + struct f2fs_io_info *); int reserve_new_block(struct dnode_of_data *); +int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); void update_extent_cache(block_t, struct dnode_of_data *); struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); -int do_write_data_page(struct page *); +int do_write_data_page(struct page *, struct f2fs_io_info *); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); /* * gc.c @@ -1149,13 +1302,13 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, sits, fnids; int total_count, utilization; - int bg_gc; + int bg_gc, inline_inode; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count; + int prefree_count, call_count, cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int tot_blks, data_blks, node_blks; int curseg[NR_CURSEG_TYPE]; @@ -1169,10 +1322,31 @@ struct f2fs_stat_info { static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) { - return (struct f2fs_stat_info*)sbi->stat_info; + return (struct f2fs_stat_info *)sbi->stat_info; } -#define stat_inc_call_count(si) ((si)->call_count++) +#define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_call_count(si) ((si)->call_count++) +#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) +#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) +#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) +#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) +#define stat_inc_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + ((F2FS_SB(inode->i_sb))->inline_inode++); \ + } while (0) +#define stat_dec_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + ((F2FS_SB(inode->i_sb))->inline_inode--); \ + } while (0) + +#define stat_inc_seg_type(sbi, curseg) \ + ((sbi)->segment_count[(curseg)->alloc_type]++) +#define stat_inc_block_count(sbi, curseg) \ + ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_seg_count(sbi, type) \ do { \ @@ -1206,7 +1380,17 @@ void f2fs_destroy_stats(struct f2fs_sb_info *); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else +#define stat_inc_cp_count(si) #define stat_inc_call_count(si) +#define stat_inc_bggc_count(si) +#define stat_inc_dirty_dir(sbi) +#define stat_dec_dirty_dir(sbi) +#define stat_inc_total_hit(sb) +#define stat_inc_read_hit(sb) +#define stat_inc_inline_inode(inode) +#define stat_dec_inline_inode(inode) +#define stat_inc_seg_type(sbi, curseg) +#define stat_inc_block_count(sbi, curseg) #define stat_inc_seg_count(si, type) #define stat_inc_tot_blk_count(si, blks) #define stat_inc_data_blk_count(si, blks) @@ -1227,4 +1411,14 @@ extern const struct address_space_operations f2fs_meta_aops; extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; + +/* + * inline.c + */ +bool f2fs_may_inline(struct inode *); +int f2fs_read_inline_data(struct inode *, struct page *); +int f2fs_convert_inline_data(struct inode *, pgoff_t); +int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); +void truncate_inline_data(struct inode *, u64); +int recover_inline_data(struct inode *, struct page *); #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 02c906971cc..7d8b9627509 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -19,6 +19,7 @@ #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/mount.h> +#include <linux/pagevec.h> #include "f2fs.h" #include "node.h" @@ -33,41 +34,26 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - block_t old_blk_addr; struct dnode_of_data dn; - int err, ilock; + int err; f2fs_balance_fs(sbi); sb_start_pagefault(inode->i_sb); /* block allocation */ - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); - if (err) { - mutex_unlock_op(sbi, ilock); + err = f2fs_reserve_block(&dn, page->index); + f2fs_unlock_op(sbi); + if (err) goto out; - } - - old_blk_addr = dn.data_blkaddr; - - if (old_blk_addr == NULL_ADDR) { - err = reserve_new_block(&dn); - if (err) { - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); - goto out; - } - } - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); file_update_time(vma->vm_file); lock_page(page); - if (page->mapping != inode->i_mapping || + if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || - !PageUptodate(page)) { + !PageUptodate(page))) { unlock_page(page); err = -EFAULT; goto out; @@ -88,9 +74,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, set_page_dirty(page); SetPageUptodate(page); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(err); @@ -98,6 +85,7 @@ out: static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -125,6 +113,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; + struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int ret = 0; bool need_cp = false; @@ -134,7 +123,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) .for_reclaim = 0, }; - if (f2fs_readonly(inode->i_sb)) + if (unlikely(f2fs_readonly(inode->i_sb))) return 0; trace_f2fs_sync_file_enter(inode); @@ -147,7 +136,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); - mutex_lock(&inode->i_mutex); + down_read(&fi->i_sem); /* * Both of fdatasync() and fsync() are able to be recovered from @@ -164,40 +153,174 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) need_cp = true; + up_read(&fi->i_sem); + if (need_cp) { nid_t pino; - F2FS_I(inode)->xattr_ver = 0; - /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); + + down_write(&fi->i_sem); + F2FS_I(inode)->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { F2FS_I(inode)->i_pino = pino; file_got_pino(inode); + up_write(&fi->i_sem); mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) goto out; + } else { + up_write(&fi->i_sem); } } else { /* if there is no written node page, write its inode page */ while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { + if (fsync_mark_done(sbi, inode->i_ino)) + goto out; mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) goto out; } - filemap_fdatawait_range(sbi->node_inode->i_mapping, - 0, LONG_MAX); - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + ret = wait_on_node_pages_writeback(sbi, inode->i_ino); + if (ret) + goto out; + ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); } out: - mutex_unlock(&inode->i_mutex); trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } +static pgoff_t __get_first_dirty_index(struct address_space *mapping, + pgoff_t pgofs, int whence) +{ + struct pagevec pvec; + int nr_pages; + + if (whence != SEEK_DATA) + return 0; + + /* find first dirty page index */ + pagevec_init(&pvec, 0); + nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); + pgofs = nr_pages ? pvec.pages[0]->index: LONG_MAX; + pagevec_release(&pvec); + return pgofs; +} + +static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, + int whence) +{ + switch (whence) { + case SEEK_DATA: + if ((blkaddr == NEW_ADDR && dirty == pgofs) || + (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR)) + return true; + break; + case SEEK_HOLE: + if (blkaddr == NULL_ADDR) + return true; + break; + } + return false; +} + +static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + struct dnode_of_data dn; + pgoff_t pgofs, end_offset, dirty; + loff_t data_ofs = offset; + loff_t isize; + int err = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) + goto fail; + + /* handle inline data case */ + if (f2fs_has_inline_data(inode)) { + if (whence == SEEK_HOLE) + data_ofs = isize; + goto found; + } + + pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); + + dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); + + for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + if (err && err != -ENOENT) { + goto fail; + } else if (err == -ENOENT) { + /* direct node is not exist */ + if (whence == SEEK_DATA) { + pgofs = PGOFS_OF_NEXT_DNODE(pgofs, + F2FS_I(inode)); + continue; + } else { + goto found; + } + } + + end_offset = IS_INODE(dn.node_page) ? + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + + /* find data/hole in dnode block */ + for (; dn.ofs_in_node < end_offset; + dn.ofs_in_node++, pgofs++, + data_ofs = pgofs << PAGE_CACHE_SHIFT) { + block_t blkaddr; + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + + if (__found_offset(blkaddr, dirty, pgofs, whence)) { + f2fs_put_dnode(&dn); + goto found; + } + } + f2fs_put_dnode(&dn); + } + + if (whence == SEEK_DATA) + goto fail; +found: + if (whence == SEEK_HOLE && data_ofs > isize) + data_ofs = isize; + mutex_unlock(&inode->i_mutex); + return vfs_setpos(file, data_ofs, maxbytes); +fail: + mutex_unlock(&inode->i_mutex); + return -ENXIO; +} + +static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + case SEEK_HOLE: + return f2fs_seek_block(file, offset, whence); + } + + return -EINVAL; +} + static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); @@ -215,7 +338,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + ofs; - for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { + for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); if (blkaddr == NULL_ADDR) continue; @@ -246,6 +369,9 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) unsigned offset = from & (PAGE_CACHE_SIZE - 1); struct page *page; + if (f2fs_has_inline_data(inode)) + return truncate_inline_data(inode, from); + if (!offset) return; @@ -254,48 +380,48 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) return; lock_page(page); - if (page->mapping != inode->i_mapping) { + if (unlikely(page->mapping != inode->i_mapping)) { f2fs_put_page(page, 1); return; } - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); f2fs_put_page(page, 1); } -static int truncate_blocks(struct inode *inode, u64 from) +int truncate_blocks(struct inode *inode, u64 from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; - int count = 0, ilock = -1; - int err; + int count = 0, err = 0; trace_f2fs_truncate_blocks_enter(inode, from); + if (f2fs_has_inline_data(inode)) + goto done; + free_from = (pgoff_t) ((from + blocksize - 1) >> (sbi->log_blocksize)); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { if (err == -ENOENT) goto free_next; - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); trace_f2fs_truncate_blocks_exit(inode, err); return err; } - if (IS_INODE(dn.node_page)) - count = ADDRS_PER_INODE(F2FS_I(inode)); - else - count = ADDRS_PER_BLOCK; + count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); count -= dn.ofs_in_node; - BUG_ON(count < 0); + f2fs_bug_on(count < 0); if (dn.ofs_in_node || IS_INODE(dn.node_page)) { truncate_data_blocks_range(&dn, count); @@ -305,8 +431,8 @@ static int truncate_blocks(struct inode *inode, u64 from) f2fs_put_dnode(&dn); free_next: err = truncate_inode_blocks(inode, free_from); - mutex_unlock_op(sbi, ilock); - + f2fs_unlock_op(sbi); +done: /* lastly zero out the first data page */ truncate_partial_data_page(inode, from); @@ -380,6 +506,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { + err = f2fs_convert_inline_data(inode, attr->ia_size); + if (err) + return err; + truncate_setsize(inode, attr->ia_size); f2fs_truncate(inode); f2fs_balance_fs(F2FS_SB(inode->i_sb)); @@ -388,7 +518,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) __setattr_copy(inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = f2fs_acl_chmod(inode); + err = posix_acl_chmod(inode, get_inode_mode(inode)); if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { inode->i_mode = fi->i_acl_mode; clear_inode_flag(fi, FI_ACL_MODE); @@ -403,12 +533,14 @@ const struct inode_operations f2fs_file_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, #endif + .fiemap = f2fs_fiemap, }; static void fill_zero(struct inode *inode, pgoff_t index, @@ -416,19 +548,18 @@ static void fill_zero(struct inode *inode, pgoff_t index, { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; - int ilock; if (!len) return; f2fs_balance_fs(sbi); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); page = get_new_data_page(inode, NULL, index, false); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (!IS_ERR(page)) { - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); @@ -458,12 +589,16 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return 0; } -static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) +static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; int ret = 0; + ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1); + if (ret) + return ret; + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -484,7 +619,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ilock; f2fs_balance_fs(sbi); @@ -493,18 +627,12 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) truncate_inode_pages_range(mapping, blk_start, blk_end - 1); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); } } - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) <= (offset + len)) { - i_size_write(inode, offset); - mark_inode_dirty(inode); - } - return ret; } @@ -521,35 +649,29 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (ret) return ret; + ret = f2fs_convert_inline_data(inode, offset + len); + if (ret) + return ret; + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; off_start = offset & (PAGE_CACHE_SIZE - 1); off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + f2fs_lock_op(sbi); + for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; - int ilock; - ilock = mutex_lock_op(sbi); + if (index == pg_end && !off_end) + goto noalloc; + set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (ret) { - mutex_unlock_op(sbi, ilock); + ret = f2fs_reserve_block(&dn, index); + if (ret) break; - } - - if (dn.data_blkaddr == NULL_ADDR) { - ret = reserve_new_block(&dn); - if (ret) { - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); - break; - } - } - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); - +noalloc: if (pg_start == pg_end) new_size = offset + len; else if (index == pg_start && off_start) @@ -564,7 +686,9 @@ static int expand_inode_data(struct inode *inode, loff_t offset, i_size_read(inode) < new_size) { i_size_write(inode, new_size); mark_inode_dirty(inode); + update_inode_page(inode); } + f2fs_unlock_op(sbi); return ret; } @@ -578,8 +702,10 @@ static long f2fs_fallocate(struct file *file, int mode, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_PUNCH_HOLE) - ret = punch_hole(inode, offset, len, mode); + ret = punch_hole(inode, offset, len); else ret = expand_inode_data(inode, offset, len, mode); @@ -587,6 +713,9 @@ static long f2fs_fallocate(struct file *file, int mode, inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } + + mutex_unlock(&inode->i_mutex); + trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; } @@ -682,11 +811,11 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) #endif const struct file_operations f2fs_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .llseek = f2fs_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .open = generic_file_open, .mmap = f2fs_file_mmap, .fsync = f2fs_sync_file, @@ -696,5 +825,5 @@ const struct file_operations f2fs_file_operations = { .compat_ioctl = f2fs_compat_ioctl, #endif .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2f157e88368..b90dbe55403 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -77,13 +77,15 @@ static int gc_thread_func(void *data) else wait_ms = increase_sleep_time(gc_th, wait_ms); -#ifdef CONFIG_F2FS_STAT_FS - sbi->bg_gc++; -#endif + stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi)) wait_ms = gc_th->no_gc_sleep_time; + + /* balancing f2fs's metadata periodically */ + f2fs_balance_fs_bg(sbi); + } while (!kthread_should_stop()); return 0; } @@ -117,7 +119,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi) kfree(gc_th); sbi->gc_thread = NULL; } - out: return err; } @@ -162,8 +163,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > MAX_VICTIM_SEARCH) - p->max_search = MAX_VICTIM_SEARCH; + if (p->max_search > sbi->max_victim_search) + p->max_search = sbi->max_victim_search; p->offset = sbi->last_victim[p->gc_mode]; } @@ -236,8 +237,8 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } -static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, - struct victim_sel_policy *p) +static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, + unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) return get_seg_entry(sbi, segno)->ckpt_valid_blocks; @@ -293,7 +294,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } break; } - p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit; + + p.offset = segno + p.ofs_unit; + if (p.ofs_unit > 1) + p.offset -= segno % p.ofs_unit; + secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) @@ -306,10 +311,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (p.min_cost > cost) { p.min_segno = segno; p.min_cost = cost; - } - - if (cost == max_cost) + } else if (unlikely(cost == max_cost)) { continue; + } if (nsearched++ >= p.max_search) { sbi->last_victim[p.gc_mode] = segno; @@ -358,12 +362,8 @@ static void add_gc_inode(struct inode *inode, struct list_head *ilist) iput(inode); return; } -repeat: - new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); - if (!new_ie) { - cond_resched(); - goto repeat; - } + + new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS); new_ie->inode = inode; list_add_tail(&new_ie->list, ilist); } @@ -428,7 +428,7 @@ next_step: /* set page dirty and write it */ if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE, true); + f2fs_wait_on_page_writeback(node_page, NODE); set_page_dirty(node_page); } else { if (!PageWriteback(node_page)) @@ -520,23 +520,23 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, static void move_data_page(struct inode *inode, struct page *page, int gc_type) { + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC, + }; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; set_page_dirty(page); set_cold_data(page); } else { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA); - if (clear_page_dirty_for_io(page) && - S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + if (clear_page_dirty_for_io(page)) inode_dec_dirty_dents(inode); - } set_cold_data(page); - do_write_data_page(page); + do_write_data_page(page, &fio); clear_cold_data(page); } out: @@ -630,7 +630,7 @@ next_iput: goto next_step; if (gc_type == FG_GC) { - f2fs_submit_bio(sbi, DATA, true); + f2fs_submit_merged_bio(sbi, DATA, WRITE); /* * In the case of FG_GC, it'd be better to reclaim this victim @@ -663,8 +663,6 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); - if (IS_ERR(sum_page)) - return; blk_start_plug(&plug); @@ -696,7 +694,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&ilist); gc_more: - if (!(sbi->sb->s_flags & MS_ACTIVE)) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + goto stop; + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) goto stop; if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { @@ -708,6 +708,11 @@ gc_more: goto stop; ret = 0; + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, + META_SSA); + for (i = 0; i < sbi->segs_per_sec; i++) do_garbage_collect(sbi, segno + i, &ilist, gc_type); @@ -737,7 +742,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) int __init create_gc_caches(void) { winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", - sizeof(struct inode_entry), NULL); + sizeof(struct inode_entry)); if (!winode_slab) return -ENOMEM; return 0; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 507056d2220..5d5eb6047bf 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -20,7 +20,7 @@ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ /* Search max. number of dirty segments to select a victim segment */ -#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */ +#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c new file mode 100644 index 00000000000..1bba5228c19 --- /dev/null +++ b/fs/f2fs/inline.c @@ -0,0 +1,250 @@ +/* + * fs/f2fs/inline.c + * Copyright (c) 2013, Intel Corporation + * Authors: Huajun Li <huajun.li@intel.com> + * Haicheng Li <haicheng.li@intel.com> + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" + +bool f2fs_may_inline(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + block_t nr_blocks; + loff_t i_size; + + if (!test_opt(sbi, INLINE_DATA)) + return false; + + nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2; + if (inode->i_blocks > nr_blocks) + return false; + + i_size = i_size_read(inode); + if (i_size > MAX_INLINE_DATA) + return false; + + return true; +} + +int f2fs_read_inline_data(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *ipage; + void *src_addr, *dst_addr; + + if (page->index) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + goto out; + } + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + unlock_page(page); + return PTR_ERR(ipage); + } + + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap(page); + f2fs_put_page(ipage, 1); + +out: + SetPageUptodate(page); + unlock_page(page); + + return 0; +} + +static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) +{ + int err; + struct page *ipage; + struct dnode_of_data dn; + void *src_addr, *dst_addr; + block_t new_blk_addr; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC | REQ_PRIO, + }; + + f2fs_lock_op(sbi); + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + /* + * i_addr[0] is not used for inline data, + * so reserving new block will not destroy inline data + */ + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, 0); + if (err) + goto out; + + f2fs_wait_on_page_writeback(page, DATA); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap(page); + SetPageUptodate(page); + + /* write data page to try to make data consistent */ + set_page_writeback(page); + write_data_page(page, &dn, &new_blk_addr, &fio); + update_extent_cache(new_blk_addr, &dn); + f2fs_wait_on_page_writeback(page, DATA); + + /* clear inline data and flag after data writeback */ + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + stat_dec_inline_inode(inode); + + sync_inode_page(&dn); + f2fs_put_dnode(&dn); +out: + f2fs_unlock_op(sbi); + return err; +} + +int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) +{ + struct page *page; + int err; + + if (!f2fs_has_inline_data(inode)) + return 0; + else if (to_size <= MAX_INLINE_DATA) + return 0; + + page = grab_cache_page(inode->i_mapping, 0); + if (!page) + return -ENOMEM; + + err = __f2fs_convert_inline_data(inode, page); + f2fs_put_page(page, 1); + return err; +} + +int f2fs_write_inline_data(struct inode *inode, + struct page *page, unsigned size) +{ + void *src_addr, *dst_addr; + struct page *ipage; + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; + ipage = dn.inode_page; + + f2fs_wait_on_page_writeback(ipage, NODE); + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + src_addr = kmap(page); + dst_addr = inline_data_addr(ipage); + memcpy(dst_addr, src_addr, size); + kunmap(page); + + /* Release the first data block if it is allocated */ + if (!f2fs_has_inline_data(inode)) { + truncate_data_blocks_range(&dn, 1); + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + stat_inc_inline_inode(inode); + } + + sync_inode_page(&dn); + f2fs_put_dnode(&dn); + + return 0; +} + +void truncate_inline_data(struct inode *inode, u64 from) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *ipage; + + if (from >= MAX_INLINE_DATA) + return; + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return; + + f2fs_wait_on_page_writeback(ipage, NODE); + + zero_user_segment(ipage, INLINE_DATA_OFFSET + from, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); +} + +int recover_inline_data(struct inode *inode, struct page *npage) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_inode *ri = NULL; + void *src_addr, *dst_addr; + struct page *ipage; + + /* + * The inline_data recovery policy is as follows. + * [prev.] [next] of inline_data flag + * o o -> recover inline_data + * o x -> remove inline_data, and then recover data blocks + * x o -> remove inline_data, and then recover inline_data + * x x -> recover data blocks + */ + if (IS_INODE(npage)) + ri = F2FS_INODE(npage); + + if (f2fs_has_inline_data(inode) && + ri && ri->i_inline & F2FS_INLINE_DATA) { +process_inline: + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + + f2fs_wait_on_page_writeback(ipage, NODE); + + src_addr = inline_data_addr(npage); + dst_addr = inline_data_addr(ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + return -1; + } + + if (f2fs_has_inline_data(inode)) { + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + f2fs_wait_on_page_writeback(ipage, NODE); + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + } else if (ri && ri->i_inline & F2FS_INLINE_DATA) { + truncate_blocks(inode, 0); + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + goto process_inline; + } + return 0; +} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9339cd29204..2cf6962f6cc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -12,6 +12,7 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/bitops.h> #include "f2fs.h" #include "node.h" @@ -21,20 +22,49 @@ void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; - - inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | - S_NOATIME | S_DIRSYNC); + unsigned int new_fl = 0; if (flags & FS_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags & FS_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & FS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & FS_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + set_mask_bits(&inode->i_flags, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); +} + +static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + if (ri->i_addr[0]) + inode->i_rdev = + old_decode_dev(le32_to_cpu(ri->i_addr[0])); + else + inode->i_rdev = + new_decode_dev(le32_to_cpu(ri->i_addr[1])); + } +} + +static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + ri->i_addr[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + ri->i_addr[1] = 0; + } else { + ri->i_addr[0] = 0; + ri->i_addr[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + ri->i_addr[2] = 0; + } + } } static int do_read_inode(struct inode *inode) @@ -42,13 +72,13 @@ static int do_read_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; - struct f2fs_node *rn; struct f2fs_inode *ri; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", (unsigned long) inode->i_ino); + WARN_ON(1); return -EINVAL; } @@ -56,8 +86,7 @@ static int do_read_inode(struct inode *inode) if (IS_ERR(node_page)) return PTR_ERR(node_page); - rn = F2FS_NODE(node_page); - ri = &(rn->i); + ri = F2FS_INODE(node_page); inode->i_mode = le16_to_cpu(ri->i_mode); i_uid_write(inode, le32_to_cpu(ri->i_uid)); @@ -73,10 +102,6 @@ static int do_read_inode(struct inode *inode) inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); inode->i_generation = le32_to_cpu(ri->i_generation); - if (ri->i_addr[0]) - inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); - else - inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); fi->i_current_depth = le32_to_cpu(ri->i_current_depth); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); @@ -84,8 +109,14 @@ static int do_read_inode(struct inode *inode) fi->flags = 0; fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); + fi->i_dir_level = ri->i_dir_level; + get_extent_info(&fi->ext, ri->i_ext); get_inline_info(fi, ri); + + /* get rdev by using inline_info */ + __get_inode_rdev(inode, ri); + f2fs_put_page(node_page, 1); return 0; } @@ -149,13 +180,11 @@ bad_inode: void update_inode(struct inode *inode, struct page *node_page) { - struct f2fs_node *rn; struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(node_page, NODE, false); + f2fs_wait_on_page_writeback(node_page, NODE); - rn = F2FS_NODE(node_page); - ri = &(rn->i); + ri = F2FS_INODE(node_page); ri->i_mode = cpu_to_le16(inode->i_mode); ri->i_advise = F2FS_I(inode)->i_advise; @@ -178,43 +207,38 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); + ri->i_dir_level = F2FS_I(inode)->i_dir_level; - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; - } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; - } - } - + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); set_page_dirty(node_page); + clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); } -int update_inode_page(struct inode *inode) +void update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *node_page; - +retry: node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) - return PTR_ERR(node_page); - + if (IS_ERR(node_page)) { + int err = PTR_ERR(node_page); + if (err == -ENOMEM) { + cond_resched(); + goto retry; + } else if (err != -ENOENT) { + f2fs_stop_checkpoint(sbi); + } + return; + } update_inode(inode, node_page); f2fs_put_page(node_page, 1); - return 0; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ret, ilock; if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) @@ -227,14 +251,14 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to lock here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - ilock = mutex_lock_op(sbi); - ret = update_inode_page(inode); - mutex_unlock_op(sbi, ilock); + f2fs_lock_op(sbi); + update_inode_page(inode); + f2fs_unlock_op(sbi); if (wbc) f2fs_balance_fs(sbi); - return ret; + return 0; } /* @@ -243,16 +267,15 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ilock; trace_f2fs_evict_inode(inode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) goto no_delete; - BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents)); + f2fs_bug_on(get_dirty_dents(inode)); remove_dirty_dir_inode(inode); if (inode->i_nlink || is_bad_inode(inode)) @@ -265,11 +288,13 @@ void f2fs_evict_inode(struct inode *inode) if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); remove_inode_page(inode); - mutex_unlock_op(sbi, ilock); + stat_dec_inline_inode(inode); + f2fs_unlock_op(sbi); sb_end_intwrite(inode->i_sb); no_delete: clear_inode(inode); + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2a5359c990f..a6bdddc33ce 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -27,32 +27,23 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; - int err, ilock; + int err; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); if (!alloc_nid(sbi, &ino)) { - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; } - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); - inode->i_uid = current_fsuid(); - - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - inode->i_gid = current_fsgid(); - } + inode_init_owner(inode, dir, mode); inode->i_ino = ino; - inode->i_mode = mode; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_generation = sbi->s_next_generation++; @@ -115,7 +106,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; nid_t ino = 0; - int err, ilock; + int err; f2fs_balance_fs(sbi); @@ -131,9 +122,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (err) goto out; @@ -157,7 +148,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct inode *inode = old_dentry->d_inode; struct super_block *sb = dir->i_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); - int err, ilock; + int err; f2fs_balance_fs(sbi); @@ -165,9 +156,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_inode_flag(F2FS_I(inode), FI_INC_LINK); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (err) goto out; @@ -207,6 +198,8 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, inode = f2fs_iget(dir->i_sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + + stat_inc_inline_inode(inode); } return d_splice_alias(inode, dentry); @@ -220,7 +213,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct f2fs_dir_entry *de; struct page *page; int err = -ENOENT; - int ilock; trace_f2fs_unlink_enter(dir, dentry); f2fs_balance_fs(sbi); @@ -229,16 +221,16 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (!de) goto fail; + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) { + f2fs_unlock_op(sbi); kunmap(page); f2fs_put_page(page, 0); goto fail; } - - ilock = mutex_lock_op(sbi); f2fs_delete_entry(de, page, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); /* In order to evict this inode, we set it dirty */ mark_inode_dirty(inode); @@ -254,7 +246,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; size_t symlen = strlen(symname) + 1; - int err, ilock; + int err; f2fs_balance_fs(sbi); @@ -265,9 +257,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (err) goto out; @@ -290,7 +282,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; - int err, ilock; + int err; f2fs_balance_fs(sbi); @@ -304,9 +296,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); set_inode_flag(F2FS_I(inode), FI_INC_LINK); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (err) goto out_fail; @@ -342,7 +334,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; int err = 0; - int ilock; if (!new_valid_dev(rdev)) return -EINVAL; @@ -356,9 +347,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (err) goto out; @@ -387,7 +378,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; - int err = -ENOENT, ilock = -1; + int err = -ENOENT; f2fs_balance_fs(sbi); @@ -402,7 +393,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); if (new_inode) { @@ -428,9 +419,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME; + down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) drop_nlink(new_inode); drop_nlink(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + + mark_inode_dirty(new_inode); if (!new_inode->i_nlink) add_orphan_inode(sbi, new_inode->i_ino); @@ -450,6 +445,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); @@ -459,25 +458,28 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + update_inode_page(old_inode); } else { kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } drop_nlink(old_dir); + mark_inode_dirty(old_dir); update_inode_page(old_dir); } - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); return 0; put_out_dir: - f2fs_put_page(new_page, 1); + kunmap(new_page); + f2fs_put_page(new_page, 0); out_dir: if (old_dir_entry) { kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); out_old: kunmap(old_page); f2fs_put_page(old_page, 0); @@ -498,6 +500,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -524,6 +527,7 @@ const struct inode_operations f2fs_special_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 51ef2789443..4b697ccc9b0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -21,9 +21,35 @@ #include "segment.h" #include <trace/events/f2fs.h> +#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) + static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; +bool available_free_memory(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct sysinfo val; + unsigned long mem_size = 0; + bool res = false; + + si_meminfo(&val); + /* give 25%, 25%, 50% memory for each components respectively */ + if (type == FREE_NIDS) { + mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> 12; + res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2); + } else if (type == NAT_ENTRIES) { + mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12; + res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2); + } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->dirty_exceeded) + return false; + mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); + res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1); + } + return res; +} + static void clear_node_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -82,40 +108,6 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -/* - * Readahead NAT pages - */ -static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) -{ - struct address_space *mapping = sbi->meta_inode->i_mapping; - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct blk_plug plug; - struct page *page; - pgoff_t index; - int i; - - blk_start_plug(&plug); - - for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { - if (nid >= nm_i->max_nid) - nid = 0; - index = current_nat_addr(sbi, nid); - - page = grab_cache_page(mapping, index); - if (!page) - continue; - if (PageUptodate(page)) { - f2fs_put_page(page, 1); - continue; - } - if (f2fs_readpage(sbi, page, index, READ)) - continue; - - f2fs_put_page(page, 0); - } - blk_finish_plug(&plug); -} - static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { return radix_tree_lookup(&nm_i->nat_root, n); @@ -149,6 +141,32 @@ int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) return is_cp; } +bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool fsync_done = false; + + read_lock(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) + fsync_done = e->fsync_done; + read_unlock(&nm_i->nat_tree_lock); + return fsync_done; +} + +void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + + write_lock(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) + e->fsync_done = false; + write_unlock(&nm_i->nat_tree_lock); +} + static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) { struct nat_entry *new; @@ -162,6 +180,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) } memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); + new->checkpointed = true; list_add_tail(&new->list, &nm_i->nat_entries); nm_i->nat_cnt++; return new; @@ -180,16 +199,13 @@ retry: write_unlock(&nm_i->nat_tree_lock); goto retry; } - nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); - nat_set_ino(e, le32_to_cpu(ne->ino)); - nat_set_version(e, ne->version); - e->checkpointed = true; + node_info_from_raw_nat(&e->ni, ne); } write_unlock(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, - block_t new_blkaddr) + block_t new_blkaddr, bool fsync_done) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -203,8 +219,7 @@ retry: goto retry; } e->ni = *ni; - e->checkpointed = true; - BUG_ON(ni->blk_addr == NEW_ADDR); + f2fs_bug_on(ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { /* * when nid is reallocated, @@ -212,19 +227,16 @@ retry: * So, reinitialize it with new information. */ e->ni = *ni; - BUG_ON(ni->blk_addr != NULL_ADDR); + f2fs_bug_on(ni->blk_addr != NULL_ADDR); } - if (new_blkaddr == NEW_ADDR) - e->checkpointed = false; - /* sanity check */ - BUG_ON(nat_get_blkaddr(e) != ni->blk_addr); - BUG_ON(nat_get_blkaddr(e) == NULL_ADDR && + f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); + f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && new_blkaddr == NULL_ADDR); - BUG_ON(nat_get_blkaddr(e) == NEW_ADDR && + f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - BUG_ON(nat_get_blkaddr(e) != NEW_ADDR && + f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR && nat_get_blkaddr(e) != NULL_ADDR && new_blkaddr == NEW_ADDR); @@ -237,14 +249,19 @@ retry: /* change address */ nat_set_blkaddr(e, new_blkaddr); __set_nat_cache_dirty(nm_i, e); + + /* update fsync_mark if its inode nat entry is still alive */ + e = __lookup_nat_cache(nm_i, ni->ino); + if (e) + e->fsync_done = fsync_done; write_unlock(&nm_i->nat_tree_lock); } -static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) + if (available_free_memory(sbi, NAT_ENTRIES)) return 0; write_lock(&nm_i->nat_tree_lock); @@ -391,8 +408,8 @@ got: /* * Caller should call f2fs_put_dnode(dn). - * Also, it should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op() only if ro is not set RDONLY_NODE. + * Also, it should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op() only if ro is not set RDONLY_NODE. * In the case of RDONLY_NODE, we don't need to care about mutex. */ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) @@ -495,15 +512,15 @@ static void truncate_node(struct dnode_of_data *dn) get_node_info(sbi, dn->nid, &ni); if (dn->inode->i_blocks == 0) { - BUG_ON(ni.blk_addr != NULL_ADDR); + f2fs_bug_on(ni.blk_addr != NULL_ADDR); goto invalidate; } - BUG_ON(ni.blk_addr == NULL_ADDR); + f2fs_bug_on(ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode, 1); - set_node_addr(sbi, &ni, NULL_ADDR); + dec_valid_node_count(sbi, dn->inode); + set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); @@ -516,6 +533,10 @@ invalidate: F2FS_SET_SB_DIRT(sbi); f2fs_put_page(dn->node_page, 1); + + invalidate_mapping_pages(NODE_MAPPING(sbi), + dn->node_page->index, dn->node_page->index); + dn->node_page = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); } @@ -631,19 +652,19 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, return 0; /* get indirect nodes in the path */ - for (i = 0; i < depth - 1; i++) { + for (i = 0; i < idx + 1; i++) { /* refernece count'll be increased */ pages[i] = get_node_page(sbi, nid[i]); if (IS_ERR(pages[i])) { - depth = i + 1; err = PTR_ERR(pages[i]); + idx = i - 1; goto fail; } nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } /* free direct nodes linked to a partial indirect node */ - for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) { + for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); if (!child_nid) continue; @@ -654,7 +675,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, set_nid(pages[idx], i, 0, false); } - if (offset[depth - 1] == 0) { + if (offset[idx + 1] == 0) { dn->node_page = pages[idx]; dn->nid = nid[idx]; truncate_node(dn); @@ -662,9 +683,10 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, f2fs_put_page(pages[idx], 1); } offset[idx]++; - offset[depth - 1] = 0; + offset[idx + 1] = 0; + idx--; fail: - for (i = depth - 3; i >= 0; i--) + for (i = idx; i >= 0; i--) f2fs_put_page(pages[i], 1); trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); @@ -678,11 +700,10 @@ fail: int truncate_inode_blocks(struct inode *inode, pgoff_t from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct address_space *node_mapping = sbi->node_inode->i_mapping; int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; - struct f2fs_node *rn; + struct f2fs_inode *ri; struct dnode_of_data dn; struct page *page; @@ -699,7 +720,7 @@ restart: set_new_dnode(&dn, inode, page, NULL, 0); unlock_page(page); - rn = F2FS_NODE(page); + ri = F2FS_INODE(page); switch (level) { case 0: case 1: @@ -709,7 +730,7 @@ restart: nofs = noffset[1]; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; nofs += 1 + NIDS_PER_BLOCK; @@ -718,7 +739,7 @@ restart: nofs = 5 + 2 * NIDS_PER_BLOCK; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; break; @@ -728,7 +749,7 @@ restart: skip_partial: while (cont) { - dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); + dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -751,14 +772,14 @@ skip_partial: if (err < 0 && err != -ENOENT) goto fail; if (offset[1] == 0 && - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { + ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (page->mapping != node_mapping) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto restart; } - wait_on_page_writeback(page); - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; + f2fs_wait_on_page_writeback(page, NODE); + ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); } @@ -794,38 +815,34 @@ int truncate_xattr_node(struct inode *inode, struct page *page) set_new_dnode(&dn, inode, page, npage, nid); if (page) - dn.inode_page_locked = 1; + dn.inode_page_locked = true; truncate_node(&dn); return 0; } /* - * Caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). */ -int remove_inode_page(struct inode *inode) +void remove_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; nid_t ino = inode->i_ino; struct dnode_of_data dn; - int err; page = get_node_page(sbi, ino); if (IS_ERR(page)) - return PTR_ERR(page); + return; - err = truncate_xattr_node(inode, page); - if (err) { + if (truncate_xattr_node(inode, page)) { f2fs_put_page(page, 1); - return err; + return; } - /* 0 is possible, after f2fs_new_inode() is failed */ - BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1); + f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); set_new_dnode(&dn, inode, page, page, ino); truncate_node(&dn); - return 0; } struct page *new_inode_page(struct inode *inode, const struct qstr *name) @@ -843,19 +860,18 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; struct node_info old_ni, new_ni; struct page *page; int err; - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(mapping, dn->nid); + page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); if (!page) return ERR_PTR(-ENOMEM); - if (!inc_valid_node_count(sbi, dn->inode, 1)) { + if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { err = -ENOSPC; goto fail; } @@ -863,17 +879,18 @@ struct page *new_node_page(struct dnode_of_data *dn, get_node_info(sbi, dn->nid, &old_ni); /* Reinitialize old_ni with new node page */ - BUG_ON(old_ni.blk_addr != NULL_ADDR); + f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); new_ni = old_ni; new_ni.ino = dn->inode->i_ino; - set_node_addr(sbi, &new_ni, NEW_ADDR); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); + f2fs_wait_on_page_writeback(page, NODE); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); SetPageUptodate(page); set_page_dirty(page); - if (ofs == XATTR_NODE_OFFSET) + if (f2fs_has_xattr_block(ofs)) F2FS_I(dn->inode)->i_xattr_nid = dn->nid; dn->node_page = page; @@ -898,14 +915,14 @@ fail: * LOCKED_PAGE: f2fs_put_page(page, 1) * error: nothing */ -static int read_node_page(struct page *page, int type) +static int read_node_page(struct page *page, int rw) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); struct node_info ni; get_node_info(sbi, page->index, &ni); - if (ni.blk_addr == NULL_ADDR) { + if (unlikely(ni.blk_addr == NULL_ADDR)) { f2fs_put_page(page, 1); return -ENOENT; } @@ -913,7 +930,7 @@ static int read_node_page(struct page *page, int type) if (PageUptodate(page)) return LOCKED_PAGE; - return f2fs_readpage(sbi, page, ni.blk_addr, type); + return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw); } /* @@ -921,18 +938,17 @@ static int read_node_page(struct page *page, int type) */ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { - struct address_space *mapping = sbi->node_inode->i_mapping; struct page *apage; int err; - apage = find_get_page(mapping, nid); + apage = find_get_page(NODE_MAPPING(sbi), nid); if (apage && PageUptodate(apage)) { f2fs_put_page(apage, 0); return; } f2fs_put_page(apage, 0); - apage = grab_cache_page(mapping, nid); + apage = grab_cache_page(NODE_MAPPING(sbi), nid); if (!apage) return; @@ -945,11 +961,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - struct address_space *mapping = sbi->node_inode->i_mapping; struct page *page; int err; repeat: - page = grab_cache_page(mapping, nid); + page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) return ERR_PTR(-ENOMEM); @@ -960,17 +975,15 @@ repeat: goto got_it; lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - if (page->mapping != mapping) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } got_it: - BUG_ON(nid != nid_of_node(page)); - mark_page_accessed(page); return page; } @@ -981,7 +994,6 @@ got_it: struct page *get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; struct blk_plug plug; struct page *page; int err, i, end; @@ -992,7 +1004,7 @@ struct page *get_node_page_ra(struct page *parent, int start) if (!nid) return ERR_PTR(-ENOENT); repeat: - page = grab_cache_page(mapping, nid); + page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) return ERR_PTR(-ENOMEM); @@ -1017,16 +1029,15 @@ repeat: blk_finish_plug(&plug); lock_page(page); - if (page->mapping != mapping) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } page_hit: - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - mark_page_accessed(page); return page; } @@ -1048,7 +1059,6 @@ void sync_inode_page(struct dnode_of_data *dn) int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, struct writeback_control *wbc) { - struct address_space *mapping = sbi->node_inode->i_mapping; pgoff_t index, end; struct pagevec pvec; int step = ino ? 2 : 0; @@ -1062,7 +1072,7 @@ next_step: while (index <= end) { int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) @@ -1095,7 +1105,7 @@ next_step: else if (!trylock_page(page)) continue; - if (unlikely(page->mapping != mapping)) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { continue_unlock: unlock_page(page); continue; @@ -1122,7 +1132,7 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); } - mapping->a_ops->writepage(page, wbc); + NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); wrote++; if (--wbc->nr_to_write == 0) @@ -1143,11 +1153,52 @@ continue_unlock: } if (wrote) - f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL); - + f2fs_submit_merged_bio(sbi, NODE, WRITE); return nwritten; } +int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) +{ + pgoff_t index = 0, end = LONG_MAX; + struct pagevec pvec; + int ret2 = 0, ret = 0; + + pagevec_init(&pvec, 0); + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (unlikely(page->index > end)) + continue; + + if (ino && ino_of_node(page) == ino) { + f2fs_wait_on_page_writeback(page, NODE); + if (TestClearPageError(page)) + ret = -EIO; + } + } + pagevec_release(&pvec); + cond_resched(); + } + + if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags))) + ret2 = -ENOSPC; + if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags))) + ret2 = -EIO; + if (!ret) + ret = ret2; + return ret; +} + static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { @@ -1155,66 +1206,71 @@ static int f2fs_write_node_page(struct page *page, nid_t nid; block_t new_addr; struct node_info ni; + struct f2fs_io_info fio = { + .type = NODE, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + }; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(sbi->por_doing)) + goto redirty_out; - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); /* get old block addr of this node page */ nid = nid_of_node(page); - BUG_ON(page->index != nid); + f2fs_bug_on(page->index != nid); get_node_info(sbi, nid, &ni); /* This page is already truncated */ - if (ni.blk_addr == NULL_ADDR) { + if (unlikely(ni.blk_addr == NULL_ADDR)) { dec_page_count(sbi, F2FS_DIRTY_NODES); unlock_page(page); return 0; } - if (wbc->for_reclaim) { - dec_page_count(sbi, F2FS_DIRTY_NODES); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; - } + if (wbc->for_reclaim) + goto redirty_out; mutex_lock(&sbi->node_write); set_page_writeback(page); - write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); - set_node_addr(sbi, &ni, new_addr); + write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); + set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); mutex_unlock(&sbi->node_write); unlock_page(page); return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; } -/* - * It is very important to gather dirty pages and write at once, so that we can - * submit a big bio without interfering other data writes. - * Be default, 512 pages (2MB) * 3 node types, is more reasonable. - */ -#define COLLECT_DIRTY_NODES 1536 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - long nr_to_write = wbc->nr_to_write; + long diff; - /* First check balancing cached NAT entries */ - if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { - f2fs_sync_fs(sbi->sb, true); - return 0; - } + trace_f2fs_writepages(mapping->host, wbc, NODE); + + /* balancing f2fs's metadata in background */ + f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) - return 0; + if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = 3 * max_hw_blocks(sbi); + diff = nr_pages_to_write(sbi, NODE, wbc); + wbc->sync_mode = WB_SYNC_NONE; sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - - wbc->nr_to_write); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); return 0; } @@ -1223,6 +1279,8 @@ static int f2fs_set_node_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + trace_f2fs_set_page_dirty(page, NODE); + SetPageUptodate(page); if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); @@ -1260,59 +1318,51 @@ const struct address_space_operations f2fs_node_aops = { .releasepage = f2fs_release_node_page, }; -static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) +static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, + nid_t n) { - struct list_head *this; - struct free_nid *i; - list_for_each(this, head) { - i = list_entry(this, struct free_nid, list); - if (i->nid == n) - return i; - } - return NULL; + return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct free_nid *i) +static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, + struct free_nid *i) { list_del(&i->list); - kmem_cache_free(free_nid_slab, i); + radix_tree_delete(&nm_i->free_nid_root, i->nid); } -static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) +static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; bool allocated = false; - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) + if (!available_free_memory(sbi, FREE_NIDS)) return -1; /* 0 nid should not be used */ - if (nid == 0) + if (unlikely(nid == 0)) return 0; - if (!build) - goto retry; - - /* do not add allocated nids */ - read_lock(&nm_i->nat_tree_lock); - ne = __lookup_nat_cache(nm_i, nid); - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) - allocated = true; - read_unlock(&nm_i->nat_tree_lock); - if (allocated) - return 0; -retry: - i = kmem_cache_alloc(free_nid_slab, GFP_NOFS); - if (!i) { - cond_resched(); - goto retry; + if (build) { + /* do not add allocated nids */ + read_lock(&nm_i->nat_tree_lock); + ne = __lookup_nat_cache(nm_i, nid); + if (ne && + (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) + allocated = true; + read_unlock(&nm_i->nat_tree_lock); + if (allocated) + return 0; } + + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; spin_lock(&nm_i->free_nid_list_lock); - if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { + if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { spin_unlock(&nm_i->free_nid_list_lock); kmem_cache_free(free_nid_slab, i); return 0; @@ -1326,18 +1376,25 @@ retry: static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) { struct free_nid *i; + bool need_free = false; + spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + need_free = true; } spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } -static void scan_nat_page(struct f2fs_nm_info *nm_i, +static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; int i; @@ -1346,13 +1403,13 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i, for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - if (start_nid >= nm_i->max_nid) + if (unlikely(start_nid >= nm_i->max_nid)) break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); - BUG_ON(blk_addr == NEW_ADDR); + f2fs_bug_on(blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) { - if (add_free_nid(nm_i, start_nid, true) < 0) + if (add_free_nid(sbi, start_nid, true) < 0) break; } } @@ -1371,16 +1428,16 @@ static void build_free_nids(struct f2fs_sb_info *sbi) return; /* readahead nat pages to be scanned */ - ra_nat_pages(sbi, nid); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT); while (1) { struct page *page = get_current_nat_page(sbi, nid); - scan_nat_page(nm_i, page, nid); + scan_nat_page(sbi, page, nid); f2fs_put_page(page, 1); nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); - if (nid >= nm_i->max_nid) + if (unlikely(nid >= nm_i->max_nid)) nid = 0; if (i++ == FREE_NID_PAGES) @@ -1396,7 +1453,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); nid = le32_to_cpu(nid_in_journal(sum, i)); if (addr == NULL_ADDR) - add_free_nid(nm_i, nid, true); + add_free_nid(sbi, nid, true); else remove_free_nid(nm_i, nid); } @@ -1412,23 +1469,20 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; - struct list_head *this; retry: - if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) + if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; spin_lock(&nm_i->free_nid_list_lock); /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !sbi->on_build_free_nids) { - BUG_ON(list_empty(&nm_i->free_nid_list)); - list_for_each(this, &nm_i->free_nid_list) { - i = list_entry(this, struct free_nid, list); + if (nm_i->fcnt && !on_build_free_nids(nm_i)) { + f2fs_bug_on(list_empty(&nm_i->free_nid_list)); + list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) break; - } - BUG_ON(i->state != NID_NEW); + f2fs_bug_on(i->state != NID_NEW); *nid = i->nid; i->state = NID_ALLOC; nm_i->fcnt--; @@ -1439,9 +1493,7 @@ retry: /* Let's scan nat pages and its caches to get free nids */ mutex_lock(&nm_i->build_lock); - sbi->on_build_free_nids = 1; build_free_nids(sbi); - sbi->on_build_free_nids = 0; mutex_unlock(&nm_i->build_lock); goto retry; } @@ -1455,10 +1507,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct free_nid *i; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); - BUG_ON(!i || i->state != NID_ALLOC); - __del_from_free_nid_list(i); + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(!i || i->state != NID_ALLOC); + __del_from_free_nid_list(nm_i, i); spin_unlock(&nm_i->free_nid_list_lock); + + kmem_cache_free(free_nid_slab, i); } /* @@ -1468,20 +1522,25 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; + bool need_free = false; if (!nid) return; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); - BUG_ON(!i || i->state != NID_ALLOC); - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { - __del_from_free_nid_list(i); + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(!i || i->state != NID_ALLOC); + if (!available_free_memory(sbi, FREE_NIDS)) { + __del_from_free_nid_list(nm_i, i); + need_free = true; } else { i->state = NID_NEW; nm_i->fcnt++; } spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, @@ -1489,90 +1548,200 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, block_t new_blkaddr) { rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); - set_node_addr(sbi, ni, new_blkaddr); + set_node_addr(sbi, ni, new_blkaddr, false); clear_node_page_dirty(page); } +static void recover_inline_xattr(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + void *src_addr, *dst_addr; + size_t inline_size; + struct page *ipage; + struct f2fs_inode *ri; + + if (!f2fs_has_inline_xattr(inode)) + return; + + if (!IS_INODE(page)) + return; + + ri = F2FS_INODE(page); + if (!(ri->i_inline & F2FS_INLINE_XATTR)) + return; + + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + + dst_addr = inline_xattr_addr(ipage); + src_addr = inline_xattr_addr(page); + inline_size = inline_xattr_size(inode); + + f2fs_wait_on_page_writeback(ipage, NODE); + memcpy(dst_addr, src_addr, inline_size); + + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); +} + +bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; + nid_t new_xnid = nid_of_node(page); + struct node_info ni; + + recover_inline_xattr(inode, page); + + if (!f2fs_has_xattr_block(ofs_of_node(page))) + return false; + + /* 1: invalidate the previous xattr nid */ + if (!prev_xnid) + goto recover_xnid; + + /* Deallocate node address */ + get_node_info(sbi, prev_xnid, &ni); + f2fs_bug_on(ni.blk_addr == NULL_ADDR); + invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, inode); + set_node_addr(sbi, &ni, NULL_ADDR, false); + +recover_xnid: + /* 2: allocate new xattr nid */ + if (unlikely(!inc_valid_node_count(sbi, inode))) + f2fs_bug_on(1); + + remove_free_nid(NM_I(sbi), new_xnid); + get_node_info(sbi, new_xnid, &ni); + ni.ino = inode->i_ino; + set_node_addr(sbi, &ni, NEW_ADDR, false); + F2FS_I(inode)->i_xattr_nid = new_xnid; + + /* 3: update xattr blkaddr */ + refresh_sit_entry(sbi, NEW_ADDR, blkaddr); + set_node_addr(sbi, &ni, blkaddr, false); + + update_inode_page(inode); + return true; +} + int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { - struct address_space *mapping = sbi->node_inode->i_mapping; - struct f2fs_node *src, *dst; + struct f2fs_inode *src, *dst; nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; - ipage = grab_cache_page(mapping, ino); + get_node_info(sbi, ino, &old_ni); + + if (unlikely(old_ni.blk_addr != NULL_ADDR)) + return -EINVAL; + + ipage = grab_cache_page(NODE_MAPPING(sbi), ino); if (!ipage) return -ENOMEM; /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); - get_node_info(sbi, ino, &old_ni); SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); - src = F2FS_NODE(page); - dst = F2FS_NODE(ipage); + src = F2FS_INODE(page); + dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); - dst->i.i_size = 0; - dst->i.i_blocks = cpu_to_le64(1); - dst->i.i_links = cpu_to_le32(1); - dst->i.i_xattr_nid = 0; + memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + dst->i_size = 0; + dst->i_blocks = cpu_to_le64(1); + dst->i_links = cpu_to_le32(1); + dst->i_xattr_nid = 0; new_ni = old_ni; new_ni.ino = ino; - if (!inc_valid_node_count(sbi, NULL, 1)) + if (unlikely(!inc_valid_node_count(sbi, NULL))) WARN_ON(1); - set_node_addr(sbi, &new_ni, NEW_ADDR); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); f2fs_put_page(ipage, 1); return 0; } +/* + * ra_sum_pages() merge contiguous pages into one bio and submit. + * these pre-readed pages are alloced in bd_inode's mapping tree. + */ +static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages, + int start, int nrpages) +{ + struct inode *inode = sbi->sb->s_bdev->bd_inode; + struct address_space *mapping = inode->i_mapping; + int i, page_idx = start; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (i = 0; page_idx < start + nrpages; page_idx++, i++) { + /* alloc page in bd_inode for reading node summary info */ + pages[i] = grab_cache_page(mapping, page_idx); + if (!pages[i]) + break; + f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio); + } + + f2fs_submit_merged_bio(sbi, META, READ); + return i; +} + int restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; struct f2fs_summary *sum_entry; - struct page *page; + struct inode *inode = sbi->sb->s_bdev->bd_inode; block_t addr; - int i, last_offset; - - /* alloc temporal page for read node */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (!page) - return -ENOMEM; - lock_page(page); + int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + struct page *pages[bio_blocks]; + int i, idx, last_offset, nrpages, err = 0; /* scan the node segment */ last_offset = sbi->blocks_per_seg; addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; - for (i = 0; i < last_offset; i++, sum_entry++) { - /* - * In order to read next node page, - * we must clear PageUptodate flag. - */ - ClearPageUptodate(page); + for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { + nrpages = min(last_offset - i, bio_blocks); - if (f2fs_readpage(sbi, page, addr, READ_SYNC)) - goto out; + /* read ahead node pages */ + nrpages = ra_sum_pages(sbi, pages, addr, nrpages); + if (!nrpages) + return -ENOMEM; + + for (idx = 0; idx < nrpages; idx++) { + if (err) + goto skip; + + lock_page(pages[idx]); + if (unlikely(!PageUptodate(pages[idx]))) { + err = -EIO; + } else { + rn = F2FS_NODE(pages[idx]); + sum_entry->nid = rn->footer.nid; + sum_entry->version = 0; + sum_entry->ofs_in_node = 0; + sum_entry++; + } + unlock_page(pages[idx]); +skip: + page_cache_release(pages[idx]); + } - lock_page(page); - rn = F2FS_NODE(page); - sum_entry->nid = rn->footer.nid; - sum_entry->version = 0; - sum_entry->ofs_in_node = 0; - addr++; + invalidate_mapping_pages(inode->i_mapping, addr, + addr + nrpages); } - unlock_page(page); -out: - __free_pages(page, 0); - return 0; + return err; } static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) @@ -1608,9 +1777,7 @@ retry: write_unlock(&nm_i->nat_tree_lock); goto retry; } - nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr)); - nat_set_ino(ne, le32_to_cpu(raw_ne.ino)); - nat_set_version(ne, raw_ne.version); + node_info_from_raw_nat(&ne->ni, &raw_ne); __set_nat_cache_dirty(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); } @@ -1627,7 +1794,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - struct list_head *cur, *n; + struct nat_entry *ne, *cur; struct page *page = NULL; struct f2fs_nat_block *nat_blk = NULL; nid_t start_nid = 0, end_nid = 0; @@ -1639,18 +1806,16 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) mutex_lock(&curseg->curseg_mutex); /* 1) flush dirty nat caches */ - list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { - struct nat_entry *ne; + list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { nid_t nid; struct f2fs_nat_entry raw_ne; int offset = -1; - block_t new_blkaddr; - - ne = list_entry(cur, struct nat_entry, list); - nid = nat_get_nid(ne); if (nat_get_blkaddr(ne) == NEW_ADDR) continue; + + nid = nat_get_nid(ne); + if (flushed) goto to_nat_page; @@ -1677,14 +1842,10 @@ to_nat_page: nat_blk = page_address(page); } - BUG_ON(!nat_blk); + f2fs_bug_on(!nat_blk); raw_ne = nat_blk->entries[nid - start_nid]; flush_now: - new_blkaddr = nat_get_blkaddr(ne); - - raw_ne.ino = cpu_to_le32(nat_get_ino(ne)); - raw_ne.block_addr = cpu_to_le32(new_blkaddr); - raw_ne.version = nat_get_version(ne); + raw_nat_from_node_info(&raw_ne, &ne->ni); if (offset < 0) { nat_blk->entries[nid - start_nid] = raw_ne; @@ -1694,23 +1855,19 @@ flush_now: } if (nat_get_blkaddr(ne) == NULL_ADDR && - add_free_nid(NM_I(sbi), nid, false) <= 0) { + add_free_nid(sbi, nid, false) <= 0) { write_lock(&nm_i->nat_tree_lock); __del_from_nat_cache(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); } else { write_lock(&nm_i->nat_tree_lock); __clear_nat_cache_dirty(nm_i, ne); - ne->checkpointed = true; write_unlock(&nm_i->nat_tree_lock); } } if (!flushed) mutex_unlock(&curseg->curseg_mutex); f2fs_put_page(page, 1); - - /* 2) shrink nat caches if necessary */ - try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD); } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -1725,10 +1882,16 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + + /* not used nids: 0, node, meta, (and root counted as valid node) */ + nm_i->available_nids = nm_i->max_nid - 3; nm_i->fcnt = 0; nm_i->nat_cnt = 0; + nm_i->ram_thresh = DEF_RAM_THRESHOLD; + INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); @@ -1781,11 +1944,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - BUG_ON(i->state == NID_ALLOC); - __del_from_free_nid_list(i); + f2fs_bug_on(i->state == NID_ALLOC); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + kmem_cache_free(free_nid_slab, i); + spin_lock(&nm_i->free_nid_list_lock); } - BUG_ON(nm_i->fcnt); + f2fs_bug_on(nm_i->fcnt); spin_unlock(&nm_i->free_nid_list_lock); /* destroy nat cache */ @@ -1793,13 +1959,11 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; - for (idx = 0; idx < found; idx++) { - struct nat_entry *e = natvec[idx]; - nid = nat_get_nid(e) + 1; - __del_from_nat_cache(nm_i, e); - } + nid = nat_get_nid(natvec[found - 1]) + 1; + for (idx = 0; idx < found; idx++) + __del_from_nat_cache(nm_i, natvec[idx]); } - BUG_ON(nm_i->nat_cnt); + f2fs_bug_on(nm_i->nat_cnt); write_unlock(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); @@ -1810,12 +1974,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) int __init create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", - sizeof(struct nat_entry), NULL); + sizeof(struct nat_entry)); if (!nat_entry_slab) return -ENOMEM; free_nid_slab = f2fs_kmem_cache_create("free_nid", - sizeof(struct free_nid), NULL); + sizeof(struct free_nid)); if (!free_nid_slab) { kmem_cache_destroy(nat_entry_slab); return -ENOMEM; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3496bb3e15d..7281112cd1c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -17,14 +17,11 @@ /* # of pages to perform readahead before building free nids */ #define FREE_NID_PAGES 4 -/* maximum # of free node ids to produce during build_free_nids */ -#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) - /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 -/* maximum cached nat entries to manage memory footprint */ -#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) +/* control the memory footprint threshold (10MB per 1GB ram) */ +#define DEF_RAM_THRESHOLD 10 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -45,6 +42,7 @@ struct node_info { struct nat_entry { struct list_head list; /* for clean or dirty nat list */ bool checkpointed; /* whether it is checkpointed or not */ + bool fsync_done; /* whether the latest node has fsync mark */ struct node_info ni; /* in-memory node information */ }; @@ -58,9 +56,15 @@ struct nat_entry { #define nat_set_version(nat, v) (nat->ni.version = v) #define __set_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); + do { \ + ne->checkpointed = false; \ + list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \ + } while (0) #define __clear_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->nat_entries); + do { \ + ne->checkpointed = true; \ + list_move_tail(&ne->list, &nm_i->nat_entries); \ + } while (0) #define inc_node_version(version) (++version) static inline void node_info_from_raw_nat(struct node_info *ni, @@ -71,6 +75,20 @@ static inline void node_info_from_raw_nat(struct node_info *ni, ni->version = raw_ne->version; } +static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, + struct node_info *ni) +{ + raw_ne->ino = cpu_to_le32(ni->ino); + raw_ne->block_addr = cpu_to_le32(ni->blk_addr); + raw_ne->version = ni->version; +} + +enum mem_type { + FREE_NIDS, /* indicates the free nid list */ + NAT_ENTRIES, /* indicates the cached nat entry */ + DIRTY_DENTS /* indicates dirty dentry pages */ +}; + /* * For free nid mangement */ @@ -224,13 +242,19 @@ static inline block_t next_blkaddr_of_node(struct page *node_page) * | `- direct node (5 + N => 5 + 2N - 1) * `- double indirect node (5 + 2N) * `- indirect node (6 + 2N) - * `- direct node (x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) + * `- direct node */ static inline bool IS_DNODE(struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); - if (ofs == XATTR_NODE_OFFSET) + if (f2fs_has_xattr_block(ofs)) return false; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || @@ -248,7 +272,7 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); - wait_on_page_writeback(p); + f2fs_wait_on_page_writeback(p, NODE); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 51ef5eec33d..a112368a4a8 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -27,21 +27,18 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi) static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, nid_t ino) { - struct list_head *this; struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + list_for_each_entry(entry, head, list) if (entry->inode->i_ino == ino) return entry; - } + return NULL; } static int recover_dentry(struct page *ipage, struct inode *inode) { - struct f2fs_node *raw_node = F2FS_NODE(ipage); - struct f2fs_inode *raw_inode = &(raw_node->i); + struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; struct qstr name; @@ -49,51 +46,71 @@ static int recover_dentry(struct page *ipage, struct inode *inode) struct inode *dir, *einode; int err = 0; - dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino); - if (!dir) { - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; - } - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); - add_dirty_dir_inode(dir); + dir = f2fs_iget(inode->i_sb, pino); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; } name.len = le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; + + if (unlikely(name.len > F2FS_NAME_LEN)) { + WARN_ON(1); + err = -ENAMETOOLONG; + goto out_err; + } retry: de = f2fs_find_entry(dir, &name, &page); - if (de && inode->i_ino == le32_to_cpu(de->ino)) { - kunmap(page); - f2fs_put_page(page, 0); - goto out; - } + if (de && inode->i_ino == le32_to_cpu(de->ino)) + goto out_unmap_put; if (de) { einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); - if (PTR_ERR(einode) == -ENOENT) + err = PTR_ERR(einode); + if (err == -ENOENT) err = -EEXIST; - goto out; + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_SB(inode->i_sb)); + if (err) { + iput(einode); + goto out_unmap_put; } f2fs_delete_entry(de, page, einode); iput(einode); goto retry; } err = __f2fs_add_link(dir, &name, inode); + if (err) + goto out_err; + + if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { + iput(dir); + } else { + add_dirty_dir_inode(dir); + set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); + } + + goto out; + +out_unmap_put: + kunmap(page); + f2fs_put_page(page, 0); +out_err: + iput(dir); out: - f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " - "ino = %x, name = %s, dir = %lx, err = %d", - ino_of_node(ipage), raw_inode->i_name, + f2fs_msg(inode->i_sb, KERN_NOTICE, + "%s: ino = %x, name = %s, dir = %lx, err = %d", + __func__, ino_of_node(ipage), raw_inode->i_name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } static int recover_inode(struct inode *inode, struct page *node_page) { - struct f2fs_node *raw_node = F2FS_NODE(node_page); - struct f2fs_inode *raw_inode = &(raw_node->i); + struct f2fs_inode *raw_inode = F2FS_INODE(node_page); if (!IS_INODE(node_page)) return 0; @@ -125,7 +142,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* read node page */ page = alloc_page(GFP_F2FS_ZERO); @@ -136,9 +153,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) while (1) { struct fsync_inode_entry *entry; - err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); if (err) - goto out; + return err; lock_page(page); @@ -184,9 +201,10 @@ next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); } + unlock_page(page); -out: __free_pages(page, 0); + return err; } @@ -206,13 +224,12 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, { struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); - unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & - (sbi->blocks_per_seg - 1); + unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + struct f2fs_summary_block *sum_node; struct f2fs_summary sum; + struct page *sum_page, *node_page; nid_t ino, nid; - void *kaddr; struct inode *inode; - struct page *node_page; unsigned int offset; block_t bidx; int i; @@ -226,18 +243,15 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; - break; + goto got_it; } } - if (i > CURSEG_COLD_DATA) { - struct page *sum_page = get_sum_page(sbi, segno); - struct f2fs_summary_block *sum_node; - kaddr = page_address(sum_page); - sum_node = (struct f2fs_summary_block *)kaddr; - sum = sum_node->entries[blkoff]; - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + sum = sum_node->entries[blkoff]; + f2fs_put_page(sum_page, 1); +got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); if (dn->inode->i_ino == nid) { @@ -285,28 +299,31 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct f2fs_summary sum; struct node_info ni; int err = 0, recovered = 0; - int ilock; + + if (recover_inline_data(inode, page)) + goto out; + + if (recover_xattr_data(inode, page, blkaddr)) + goto out; start = start_bidx_of_node(ofs_of_node(page), fi); - if (IS_INODE(page)) - end = start + ADDRS_PER_INODE(fi); - else - end = start + ADDRS_PER_BLOCK; + end = start + ADDRS_PER_PAGE(page, fi); + + f2fs_lock_op(sbi); - ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { - mutex_unlock_op(sbi, ilock); - return err; + f2fs_unlock_op(sbi); + goto out; } - wait_on_page_writeback(dn.node_page); + f2fs_wait_on_page_writeback(dn.node_page, NODE); get_node_info(sbi, dn.nid, &ni); - BUG_ON(ni.ino != ino_of_node(page)); - BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page)); + f2fs_bug_on(ni.ino != ino_of_node(page)); + f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page)); for (; start < end; start++) { block_t src, dest; @@ -316,9 +333,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) { if (src == NULL_ADDR) { - int err = reserve_new_block(&dn); + err = reserve_new_block(&dn); /* We should not get -ENOSPC */ - BUG_ON(err); + f2fs_bug_on(err); } /* Check the previous node page having this index */ @@ -349,11 +366,11 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); err: f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); - - f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " - "recovered_data = %d blocks, err = %d", - inode->i_ino, recovered, err); + f2fs_unlock_op(sbi); +out: + f2fs_msg(sbi->sb, KERN_NOTICE, + "recover_data: ino = %lx, recovered = %d blocks, err = %d", + inode->i_ino, recovered, err); return err; } @@ -371,7 +388,7 @@ static int recover_data(struct f2fs_sb_info *sbi, blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* read node page */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); + page = alloc_page(GFP_F2FS_ZERO); if (!page) return -ENOMEM; @@ -380,9 +397,9 @@ static int recover_data(struct f2fs_sb_info *sbi, while (1) { struct fsync_inode_entry *entry; - err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); if (err) - goto out; + return err; lock_page(page); @@ -406,8 +423,8 @@ next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); } + unlock_page(page); -out: __free_pages(page, 0); if (!err) @@ -419,16 +436,17 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) { struct list_head inode_list; int err; + bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", - sizeof(struct fsync_inode_entry), NULL); - if (unlikely(!fsync_entry_slab)) + sizeof(struct fsync_inode_entry)); + if (!fsync_entry_slab) return -ENOMEM; INIT_LIST_HEAD(&inode_list); /* step #1: find fsynced inode numbers */ - sbi->por_doing = 1; + sbi->por_doing = true; err = find_fsync_dnodes(sbi, &inode_list); if (err) goto out; @@ -436,14 +454,16 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) if (list_empty(&inode_list)) goto out; + need_writecp = true; + /* step #2: recover data */ err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); - BUG_ON(!list_empty(&inode_list)); + f2fs_bug_on(!list_empty(&inode_list)); out: destroy_fsync_dnodes(&inode_list); kmem_cache_destroy(fsync_entry_slab); - sbi->por_doing = 0; - if (!err) + sbi->por_doing = false; + if (!err && need_writecp) write_checkpoint(sbi, false); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 09af9c7b0f5..d04613df710 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -13,13 +13,165 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/prefetch.h> +#include <linux/kthread.h> #include <linux/vmalloc.h> +#include <linux/swap.h> #include "f2fs.h" #include "segment.h" #include "node.h" #include <trace/events/f2fs.h> +#define __reverse_ffz(x) __reverse_ffs(~(x)) + +static struct kmem_cache *discard_entry_slab; + +/* + * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since + * MSB and LSB are reversed in a byte by f2fs_set_bit. + */ +static inline unsigned long __reverse_ffs(unsigned long word) +{ + int num = 0; + +#if BITS_PER_LONG == 64 + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } +#endif + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + if ((word & 0xf0) == 0) + num += 4; + else + word >>= 4; + if ((word & 0xc) == 0) + num += 2; + else + word >>= 2; + if ((word & 0x2) == 0) + num += 1; + return num; +} + +/* + * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue + * f2fs_set_bit makes MSB and LSB reversed in a byte. + * Example: + * LSB <--> MSB + * f2fs_set_bit(0, bitmap) => 0000 0001 + * f2fs_set_bit(7, bitmap) => 1000 0000 + */ +static unsigned long __find_rev_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~0UL << quot; + submask = (unsigned char)(0xff << rest) >> rest; + submask <<= quot; + mask &= submask; + tmp &= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG-1)) { + tmp = *(p++); + if (tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffs(tmp); +} + +static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~(~0UL << quot); + submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest); + submask <<= quot; + mask += submask; + tmp |= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (~tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG - 1)) { + tmp = *(p++); + if (~tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp |= ~0UL << size; + if (tmp == ~0UL) /* Are any bits zero? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffz(tmp); +} + /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. @@ -36,6 +188,114 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) } } +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) +{ + /* check the # of cached NAT entries and prefree segments */ + if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || + excess_prefree_segs(sbi)) + f2fs_sync_fs(sbi->sb, true); +} + +static int issue_flush_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + wait_queue_head_t *q = &fcc->flush_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + spin_lock(&fcc->issue_lock); + if (fcc->issue_list) { + fcc->dispatch_list = fcc->issue_list; + fcc->issue_list = fcc->issue_tail = NULL; + } + spin_unlock(&fcc->issue_lock); + + if (fcc->dispatch_list) { + struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct flush_cmd *cmd, *next; + int ret; + + bio->bi_bdev = sbi->sb->s_bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + + for (cmd = fcc->dispatch_list; cmd; cmd = next) { + cmd->ret = ret; + next = cmd->next; + complete(&cmd->wait); + } + bio_put(bio); + fcc->dispatch_list = NULL; + } + + wait_event_interruptible(*q, + kthread_should_stop() || fcc->issue_list); + goto repeat; +} + +int f2fs_issue_flush(struct f2fs_sb_info *sbi) +{ + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd cmd; + + if (!test_opt(sbi, FLUSH_MERGE)) + return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); + + init_completion(&cmd.wait); + cmd.next = NULL; + + spin_lock(&fcc->issue_lock); + if (fcc->issue_list) + fcc->issue_tail->next = &cmd; + else + fcc->issue_list = &cmd; + fcc->issue_tail = &cmd; + spin_unlock(&fcc->issue_lock); + + if (!fcc->dispatch_list) + wake_up(&fcc->flush_wait_queue); + + wait_for_completion(&cmd.wait); + + return cmd.ret; +} + +int create_flush_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct flush_cmd_control *fcc; + int err = 0; + + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); + if (!fcc) + return -ENOMEM; + spin_lock_init(&fcc->issue_lock); + init_waitqueue_head(&fcc->flush_wait_queue); + sbi->sm_info->cmd_control_info = fcc; + fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(fcc->f2fs_issue_flush)) { + err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); + sbi->sm_info->cmd_control_info = NULL; + return err; + } + + return err; +} + +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +{ + struct flush_cmd_control *fcc = + sbi->sm_info->cmd_control_info; + + if (fcc && fcc->f2fs_issue_flush) + kthread_stop(fcc->f2fs_issue_flush); + kfree(fcc); + sbi->sm_info->cmd_control_info = NULL; +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -50,20 +310,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); - enum dirty_type t = DIRTY_HOT_DATA; + enum dirty_type t = sentry->type; - dirty_type = sentry->type; - - if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) - dirty_i->nr_dirty[dirty_type]++; - - /* Only one bitmap should be set */ - for (; t <= DIRTY_COLD_NODE; t++) { - if (t == dirty_type) - continue; - if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) - dirty_i->nr_dirty[t]--; - } + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]++; } } @@ -76,12 +326,11 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, dirty_i->nr_dirty[dirty_type]--; if (dirty_type == DIRTY) { - enum dirty_type t = DIRTY_HOT_DATA; + struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = sentry->type; - /* clear all the bitmaps */ - for (; t <= DIRTY_COLD_NODE; t++) - if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) - dirty_i->nr_dirty[t]--; + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) clear_bit(GET_SECNO(sbi, segno), @@ -119,6 +368,69 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); + sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); +} + +void discard_next_dnode(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + block_t blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + if (f2fs_issue_discard(sbi, blkaddr, 1)) { + struct page *page = grab_meta_page(sbi, blkaddr); + /* zero-filled page */ + set_page_dirty(page); + f2fs_put_page(page, 1); + } +} + +static void add_discard_addrs(struct f2fs_sb_info *sbi, + unsigned int segno, struct seg_entry *se) +{ + struct list_head *head = &SM_I(sbi)->discard_list; + struct discard_entry *new; + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + int max_blocks = sbi->blocks_per_seg; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long dmap[entries]; + unsigned int start = 0, end = -1; + int i; + + if (!test_opt(sbi, DISCARD)) + return; + + /* zero block will be discarded through the prefree list */ + if (!se->valid_blocks || se->valid_blocks == max_blocks) + return; + + /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ + for (i = 0; i < entries; i++) + dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; + + while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + start = __find_rev_next_bit(dmap, max_blocks, end + 1); + if (start >= max_blocks) + break; + + end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); + INIT_LIST_HEAD(&new->list); + new->blkaddr = START_BLOCK(sbi, segno) + start; + new->len = end - start; + + list_add_tail(&new->list, head); + SM_I(sbi)->nr_discards += end - start; + } +} + /* * Should call clear_prefree_segments after checkpoint is done. */ @@ -141,30 +453,42 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi) { + struct list_head *head = &(SM_I(sbi)->discard_list); + struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno = -1; + unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int total_segs = TOTAL_SEGS(sbi); + unsigned int start = 0, end = -1; mutex_lock(&dirty_i->seglist_lock); + while (1) { - segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - segno + 1); - if (segno >= total_segs) + int i; + start = find_next_bit(prefree_map, total_segs, end + 1); + if (start >= total_segs) break; + end = find_next_zero_bit(prefree_map, total_segs, start + 1); + + for (i = start; i < end; i++) + clear_bit(i, prefree_map); + + dirty_i->nr_dirty[PRE] -= end - start; - if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) - dirty_i->nr_dirty[PRE]--; - - /* Let's use trim */ - if (test_opt(sbi, DISCARD)) - blkdev_issue_discard(sbi->sb->s_bdev, - START_BLOCK(sbi, segno) << - sbi->log_sectors_per_block, - 1 << (sbi->log_sectors_per_block + - sbi->log_blocks_per_seg), - GFP_NOFS, 0); + if (!test_opt(sbi, DISCARD)) + continue; + + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + (end - start) << sbi->log_blocks_per_seg); } mutex_unlock(&dirty_i->seglist_lock); + + /* send small discards */ + list_for_each_entry_safe(entry, this, head, list) { + f2fs_issue_discard(sbi, entry->blkaddr, entry->len); + list_del(&entry->list); + SM_I(sbi)->nr_discards -= entry->len; + kmem_cache_free(discard_entry_slab, entry); + } } static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -193,9 +517,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; - offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) || + f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || (new_vblocks > sbi->blocks_per_seg))); se->valid_blocks = new_vblocks; @@ -222,12 +546,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -static void refresh_sit_entry(struct f2fs_sb_info *sbi, - block_t old_blkaddr, block_t new_blkaddr) +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) { - update_sit_entry(sbi, new_blkaddr, 1); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) - update_sit_entry(sbi, old_blkaddr, -1); + update_sit_entry(sbi, new, 1); + if (GET_SEGNO(sbi, old) != NULL_SEGNO) + update_sit_entry(sbi, old, -1); + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); } void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) @@ -235,7 +561,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); - BUG_ON(addr == NULL_ADDR); + f2fs_bug_on(addr == NULL_ADDR); if (addr == NEW_ADDR) return; @@ -267,9 +593,8 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, */ int npages_for_summary_flush(struct f2fs_sb_info *sbi) { - int total_size_bytes = 0; int valid_sum_count = 0; - int i, sum_space; + int i, sum_in_page; for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { if (sbi->ckpt->alloc_type[i] == SSR) @@ -278,13 +603,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi) valid_sum_count += curseg_blkoff(sbi, i); } - total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1) - + sizeof(struct nat_journal) + 2 - + sizeof(struct sit_journal) + 2; - sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE; - if (total_size_bytes < sum_space) + sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - + SUM_FOOTER_SIZE) / SUMMARY_SIZE; + if (valid_sum_count <= sum_in_page) return 1; - else if (total_size_bytes < 2 * sum_space) + else if ((valid_sum_count - sum_in_page) <= + (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -350,7 +674,7 @@ find_other_zone: if (dir == ALLOC_RIGHT) { secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), 0); - BUG_ON(secno >= TOTAL_SECS(sbi)); + f2fs_bug_on(secno >= TOTAL_SECS(sbi)); } else { go_left = 1; left_start = hint - 1; @@ -366,7 +690,7 @@ find_other_zone: } left_start = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), 0); - BUG_ON(left_start >= TOTAL_SECS(sbi)); + f2fs_bug_on(left_start >= TOTAL_SECS(sbi)); break; } secno = left_start; @@ -405,7 +729,7 @@ skip_left: } got_it: /* set it as dirty segment in free segmap */ - BUG_ON(test_bit(segno, free_i->free_segmap)); + f2fs_bug_on(test_bit(segno, free_i->free_segmap)); __set_inuse(sbi, segno); *newseg = segno; write_unlock(&free_i->segmap_lock); @@ -458,13 +782,18 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg, block_t start) { struct seg_entry *se = get_seg_entry(sbi, seg->segno); - block_t ofs; - for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) { - if (!f2fs_test_bit(ofs, se->ckpt_valid_map) - && !f2fs_test_bit(ofs, se->cur_valid_map)) - break; - } - seg->next_blkoff = ofs; + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long target_map[entries]; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i, pos; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); + + seg->next_blkoff = pos; } /* @@ -550,9 +879,8 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, change_curseg(sbi, type, true); else new_curseg(sbi, type, false); -#ifdef CONFIG_F2FS_STAT_FS - sbi->segment_count[curseg->alloc_type]++; -#endif + + stat_inc_seg_type(sbi, curseg); } void allocate_new_segments(struct f2fs_sb_info *sbi) @@ -573,141 +901,6 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; -static void f2fs_end_io_write(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_private *p = bio->bi_private; - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (!uptodate) { - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); - p->sbi->sb->s_flags |= MS_RDONLY; - } - end_page_writeback(page); - dec_page_count(p->sbi, F2FS_WRITEBACK); - } while (bvec >= bio->bi_io_vec); - - if (p->is_sync) - complete(p->wait); - kfree(p); - bio_put(bio); -} - -struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages) -{ - struct bio *bio; - - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - bio->bi_bdev = bdev; - bio->bi_private = NULL; - - return bio; -} - -static void do_submit_bio(struct f2fs_sb_info *sbi, - enum page_type type, bool sync) -{ - int rw = sync ? WRITE_SYNC : WRITE; - enum page_type btype = type > META ? META : type; - - if (type >= META_FLUSH) - rw = WRITE_FLUSH_FUA; - - if (btype == META) - rw |= REQ_META; - - if (sbi->bio[btype]) { - struct bio_private *p = sbi->bio[btype]->bi_private; - p->sbi = sbi; - sbi->bio[btype]->bi_end_io = f2fs_end_io_write; - - trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]); - - if (type == META_FLUSH) { - DECLARE_COMPLETION_ONSTACK(wait); - p->is_sync = true; - p->wait = &wait; - submit_bio(rw, sbi->bio[btype]); - wait_for_completion(&wait); - } else { - p->is_sync = false; - submit_bio(rw, sbi->bio[btype]); - } - sbi->bio[btype] = NULL; - } -} - -void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync) -{ - down_write(&sbi->bio_sem); - do_submit_bio(sbi, type, sync); - up_write(&sbi->bio_sem); -} - -static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, enum page_type type) -{ - struct block_device *bdev = sbi->sb->s_bdev; - - verify_block_addr(sbi, blk_addr); - - down_write(&sbi->bio_sem); - - inc_page_count(sbi, F2FS_WRITEBACK); - - if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1) - do_submit_bio(sbi, type, false); -alloc_new: - if (sbi->bio[type] == NULL) { - struct bio_private *priv; -retry: - priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); - if (!priv) { - cond_resched(); - goto retry; - } - - sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi)); - sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - sbi->bio[type]->bi_private = priv; - /* - * The end_io will be assigned at the sumbission phase. - * Until then, let bio_add_page() merge consecutive IOs as much - * as possible. - */ - } - - if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { - do_submit_bio(sbi, type, false); - goto alloc_new; - } - - sbi->last_block_in_bio[type] = blk_addr; - - up_write(&sbi->bio_sem); - trace_f2fs_submit_write_page(page, blk_addr, type); -} - -void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool sync) -{ - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); - if (PageWriteback(page)) { - f2fs_submit_bio(sbi, type, sync); - wait_on_page_writeback(page); - } -} - static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -771,20 +964,18 @@ static int __get_segment_type(struct page *page, enum page_type p_type) return __get_segment_type_4(page, p_type); } /* NR_CURSEG_TYPE(6) logs by default */ - BUG_ON(sbi->active_logs != NR_CURSEG_TYPE); + f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE); return __get_segment_type_6(page, p_type); } -static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, enum page_type p_type) +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int old_cursegno; - int type; - type = __get_segment_type(page, p_type); curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); @@ -801,66 +992,78 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, mutex_lock(&sit_i->sentry_lock); __refresh_next_blkoff(sbi, curseg); -#ifdef CONFIG_F2FS_STAT_FS - sbi->block_count[curseg->alloc_type]++; -#endif + stat_inc_block_count(sbi, curseg); + + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + mutex_unlock(&sit_i->sentry_lock); - if (p_type == NODE) + if (page && IS_NODESEG(type)) fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); - /* writeout dirty page into bdev */ - submit_write_page(sbi, page, *new_blkaddr, p_type); - mutex_unlock(&curseg->curseg_mutex); } +static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, struct f2fs_io_info *fio) +{ + int type = __get_segment_type(page, fio->type); + + allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type); + + /* writeout dirty page into bdev */ + f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio); +} + void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { + struct f2fs_io_info fio = { + .type = META, + .rw = WRITE_SYNC | REQ_META | REQ_PRIO + }; + set_page_writeback(page); - submit_write_page(sbi, page, page->index, META); + f2fs_submit_page_mbio(sbi, page, page->index, &fio); } void write_node_page(struct f2fs_sb_info *sbi, struct page *page, + struct f2fs_io_info *fio, unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) { struct f2fs_summary sum; set_summary(&sum, nid, 0, 0); - do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); + do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio); } -void write_data_page(struct inode *inode, struct page *page, - struct dnode_of_data *dn, block_t old_blkaddr, - block_t *new_blkaddr) +void write_data_page(struct page *page, struct dnode_of_data *dn, + block_t *new_blkaddr, struct f2fs_io_info *fio) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct f2fs_summary sum; struct node_info ni; - BUG_ON(old_blkaddr == NULL_ADDR); + f2fs_bug_on(dn->data_blkaddr == NULL_ADDR); get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - do_write_page(sbi, page, old_blkaddr, - new_blkaddr, &sum, DATA); + do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio); } -void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blk_addr) +void rewrite_data_page(struct page *page, block_t old_blkaddr, + struct f2fs_io_info *fio) { - submit_write_page(sbi, page, old_blk_addr, DATA); + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio); } void recover_data_page(struct f2fs_sb_info *sbi, @@ -896,14 +1099,11 @@ void recover_data_page(struct f2fs_sb_info *sbi, change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); @@ -919,6 +1119,10 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, unsigned int segno, old_cursegno; block_t next_blkaddr = next_blkaddr_of_node(page); unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); + struct f2fs_io_info fio = { + .type = NODE, + .rw = WRITE_SYNC, + }; curseg = CURSEG_I(sbi, type); @@ -933,8 +1137,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, curseg->next_segno = segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); /* change the current log to the next block addr in advance */ @@ -942,22 +1145,54 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, curseg->next_segno = next_segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr); /* rewrite node page */ set_page_writeback(page); - submit_write_page(sbi, page, new_blkaddr, NODE); - f2fs_submit_bio(sbi, NODE, true); + f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); + f2fs_submit_merged_bio(sbi, NODE, WRITE); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); } +static inline bool is_merged_page(struct f2fs_sb_info *sbi, + struct page *page, enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + struct bio_vec *bvec; + int i; + + down_read(&io->io_rwsem); + if (!io->bio) + goto out; + + bio_for_each_segment_all(bvec, io->bio, i) { + if (page == bvec->bv_page) { + up_read(&io->io_rwsem); + return true; + } + } + +out: + up_read(&io->io_rwsem); + return false; +} + +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + if (PageWriteback(page)) { + if (is_merged_page(sbi, page, type)) + f2fs_submit_merged_bio(sbi, type, WRITE); + wait_on_page_writeback(page); + } +} + static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1062,9 +1297,12 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - if (restore_node_summary(sbi, segno, sum)) { + int err; + + err = restore_node_summary(sbi, segno, sum); + if (err) { f2fs_put_page(new, 1); - return -EINVAL; + return err; } } } @@ -1085,6 +1323,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { int type = CURSEG_HOT_DATA; + int err; if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { /* restore for compacted data summary */ @@ -1093,9 +1332,12 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) type = CURSEG_HOT_NODE; } - for (; type <= CURSEG_COLD_NODE; type++) - if (read_normal_summaries(sbi, type)) - return -EINVAL; + for (; type <= CURSEG_COLD_NODE; type++) { + err = read_normal_summaries(sbi, type); + if (err) + return err; + } + return 0; } @@ -1122,8 +1364,6 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; - set_page_dirty(page); - /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blkoff; @@ -1142,18 +1382,20 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) summary = (struct f2fs_summary *)(kaddr + written_size); *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; - set_page_dirty(page); if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) continue; + set_page_dirty(page); f2fs_put_page(page, 1); page = NULL; } } - if (page) + if (page) { + set_page_dirty(page); f2fs_put_page(page, 1); + } } static void write_normal_summaries(struct f2fs_sb_info *sbi, @@ -1239,7 +1481,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, /* get current sit block page without lock */ src_page = get_meta_page(sbi, src_off); dst_page = grab_meta_page(sbi, dst_off); - BUG_ON(PageDirty(src_page)); + f2fs_bug_on(PageDirty(src_page)); src_addr = page_address(src_page); dst_addr = page_address(dst_page); @@ -1271,9 +1513,9 @@ static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) __mark_sit_entry_dirty(sbi, segno); } update_sits_in_cursum(sum, -sits_in_cursum(sum)); - return 1; + return true; } - return 0; + return false; } /* @@ -1308,6 +1550,10 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + /* add discard candidates */ + if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) + add_discard_addrs(sbi, segno, se); + if (flushed) goto to_sit_page; @@ -1479,36 +1725,48 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - unsigned int start; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + unsigned int i, start, end; + unsigned int readed, start_blk = 0; + int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - for (start = 0; start < TOTAL_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; - struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; - struct page *page; - int i; - - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) == start) { - sit = sit_in_journal(sum, i); - mutex_unlock(&curseg->curseg_mutex); - goto got_it; + do { + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); + + start = start_blk * sit_i->sents_per_block; + end = (start_blk + readed) * sit_i->sents_per_block; + + for (; start < end && start < TOTAL_SEGS(sbi); start++) { + struct seg_entry *se = &sit_i->sentries[start]; + struct f2fs_sit_block *sit_blk; + struct f2fs_sit_entry sit; + struct page *page; + + mutex_lock(&curseg->curseg_mutex); + for (i = 0; i < sits_in_cursum(sum); i++) { + if (le32_to_cpu(segno_in_journal(sum, i)) + == start) { + sit = sit_in_journal(sum, i); + mutex_unlock(&curseg->curseg_mutex); + goto got_it; + } } - } - mutex_unlock(&curseg->curseg_mutex); - page = get_current_sit_page(sbi, start); - sit_blk = (struct f2fs_sit_block *)page_address(page); - sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; - f2fs_put_page(page, 1); + mutex_unlock(&curseg->curseg_mutex); + + page = get_current_sit_page(sbi, start); + sit_blk = (struct f2fs_sit_block *)page_address(page); + sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; + f2fs_put_page(page, 1); got_it: - check_block_count(sbi, start, &sit); - seg_info_from_raw_sit(se, &sit); - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + if (sbi->segs_per_sec > 1) { + struct sec_entry *e = get_sec_entry(sbi, start); + e->valid_blocks += se->valid_blocks; + } } - } + start_blk += readed; + } while (start_blk < sit_blk_cnt); } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -1628,8 +1886,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) /* init sm info */ sbi->sm_info = sm_info; - INIT_LIST_HEAD(&sm_info->wblist_head); - spin_lock_init(&sm_info->wblist_lock); sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); sm_info->segment_count = le32_to_cpu(raw_super->segment_count); @@ -1637,6 +1893,20 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + sm_info->rec_prefree_segments = sm_info->main_segments * + DEF_RECLAIM_PREFREE_SEGMENTS / 100; + sm_info->ipu_policy = F2FS_IPU_DISABLE; + sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; + + INIT_LIST_HEAD(&sm_info->discard_list); + sm_info->nr_discards = 0; + sm_info->max_discards = 0; + + if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + err = create_flush_cmd_control(sbi); + if (err) + return err; + } err = build_sit_info(sbi); if (err) @@ -1744,6 +2014,10 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) void destroy_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); + + if (!sm_info) + return; + destroy_flush_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); @@ -1751,3 +2025,17 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) sbi->sm_info = NULL; kfree(sm_info); } + +int __init create_segment_manager_caches(void) +{ + discard_entry_slab = f2fs_kmem_cache_create("discard_entry", + sizeof(struct discard_entry)); + if (!discard_entry_slab) + return -ENOMEM; + return 0; +} + +void destroy_segment_manager_caches(void) +{ + kmem_cache_destroy(discard_entry_slab); +} diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index bdd10eab8c4..7091204680f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -14,17 +14,14 @@ #define NULL_SEGNO ((unsigned int)(~0)) #define NULL_SECNO ((unsigned int)(~0)) +#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) -#define IS_DATASEG(t) \ - ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \ - (t == CURSEG_WARM_DATA)) - -#define IS_NODESEG(t) \ - ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \ - (t == CURSEG_WARM_NODE)) +#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) #define IS_CURSEG(sbi, seg) \ ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ @@ -60,6 +57,9 @@ ((blk_addr) - SM_I(sbi)->seg0_blkaddr) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) +#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + #define GET_SEGNO(sbi, blk_addr) \ (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ @@ -81,22 +81,19 @@ (segno / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(sit_i, segno) \ (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) +#define SIT_BLK_CNT(sbi) \ + ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) #define TOTAL_SECS(sbi) (sbi->total_sections) #define SECTOR_FROM_BLOCK(sbi, blk_addr) \ - (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) + (((sector_t)blk_addr) << (sbi)->log_sectors_per_block) #define SECTOR_TO_BLOCK(sbi, sectors) \ - (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) - -/* during checkpoint, bio_private is used to synchronize the last bio */ -struct bio_private { - struct f2fs_sb_info *sbi; - bool is_sync; - void *wait; -}; + (sectors >> (sbi)->log_sectors_per_block) +#define MAX_BIO_BLOCKS(max_hw_blocks) \ + (min((int)max_hw_blocks, BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -383,26 +380,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, static inline block_t written_block_count(struct f2fs_sb_info *sbi) { - struct sit_info *sit_i = SIT_I(sbi); - block_t vblocks; - - mutex_lock(&sit_i->sentry_lock); - vblocks = sit_i->written_valid_blocks; - mutex_unlock(&sit_i->sentry_lock); - - return vblocks; + return SIT_I(sbi)->written_valid_blocks; } static inline unsigned int free_segments(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_segs; - - read_lock(&free_i->segmap_lock); - free_segs = free_i->free_segments; - read_unlock(&free_i->segmap_lock); - - return free_segs; + return FREE_I(sbi)->free_segments; } static inline int reserved_segments(struct f2fs_sb_info *sbi) @@ -412,14 +395,7 @@ static inline int reserved_segments(struct f2fs_sb_info *sbi) static inline unsigned int free_sections(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_secs; - - read_lock(&free_i->segmap_lock); - free_secs = free_i->free_sections; - read_unlock(&free_i->segmap_lock); - - return free_secs; + return FREE_I(sbi)->free_sections; } static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) @@ -454,8 +430,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) static inline bool need_SSR(struct f2fs_sb_info *sbi) { - return ((prefree_segments(sbi) / sbi->segs_per_sec) - + free_sections(sbi) < overprovision_sections(sbi)); + return (prefree_segments(sbi) / sbi->segs_per_sec) + + free_sections(sbi) < overprovision_sections(sbi); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) @@ -463,33 +439,71 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - if (sbi->por_doing) + if (unlikely(sbi->por_doing)) return false; - return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi))); + return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi)); +} + +static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) +{ + return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; } static inline int utilization(struct f2fs_sb_info *sbi) { - return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); + return div_u64((u64)valid_user_blocks(sbi) * 100, + sbi->user_block_count); } /* * Sometimes f2fs may be better to drop out-of-place update policy. - * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write - * data in the original place likewise other traditional file systems. - * But, currently set 100 in percentage, which means it is disabled. - * See below need_inplace_update(). + * And, users can control the policy through sysfs entries. + * There are five policies with triggering conditions as follows. + * F2FS_IPU_FORCE - all the time, + * F2FS_IPU_SSR - if SSR mode is activated, + * F2FS_IPU_UTIL - if FS utilization is over threashold, + * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over + * threashold, + * F2FS_IPUT_DISABLE - disable IPU. (=default option) */ -#define MIN_IPU_UTIL 100 +#define DEF_MIN_IPU_UTIL 70 + +enum { + F2FS_IPU_FORCE, + F2FS_IPU_SSR, + F2FS_IPU_UTIL, + F2FS_IPU_SSR_UTIL, + F2FS_IPU_DISABLE, +}; + static inline bool need_inplace_update(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + /* IPU can be done only for the user data */ if (S_ISDIR(inode->i_mode)) return false; - if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL) + + switch (SM_I(sbi)->ipu_policy) { + case F2FS_IPU_FORCE: return true; + case F2FS_IPU_SSR: + if (need_SSR(sbi)) + return true; + break; + case F2FS_IPU_UTIL: + if (utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + break; + case F2FS_IPU_SSR_UTIL: + if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + break; + case F2FS_IPU_DISABLE: + break; + } return false; } @@ -513,16 +527,13 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) return curseg->next_blkoff; } +#ifdef CONFIG_F2FS_CHECK_FS static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) { unsigned int end_segno = SM_I(sbi)->segment_count - 1; BUG_ON(segno > end_segno); } -/* - * This function is used for only debugging. - * NOTE: In future, we have to remove this function. - */ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { struct f2fs_sm_info *sm_info = SM_I(sbi); @@ -541,8 +552,9 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, { struct f2fs_sm_info *sm_info = SM_I(sbi); unsigned int end_segno = sm_info->segment_count - 1; + bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; - int i; + int cur_pos = 0, next_pos; /* check segment usage */ BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); @@ -551,11 +563,26 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, BUG_ON(segno > end_segno); /* check bitmap with valid block count */ - for (i = 0; i < sbi->blocks_per_seg; i++) - if (f2fs_test_bit(i, raw_sit->valid_map)) - valid_blocks++; + do { + if (is_valid) { + next_pos = find_next_zero_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + cur_pos); + valid_blocks += next_pos - cur_pos; + } else + next_pos = find_next_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + cur_pos); + cur_pos = next_pos; + is_valid = !is_valid; + } while (cur_pos < sbi->blocks_per_seg); BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); } +#else +#define check_seg_range(sbi, segno) +#define verify_block_addr(sbi, blk_addr) +#define check_block_count(sbi, segno, raw_sit) +#endif static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) @@ -637,3 +664,46 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) struct request_queue *q = bdev_get_queue(bdev); return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); } + +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * By default, 512 pages for directory data, + * 512 pages (2MB) * 3 for three types of nodes, and + * max_bio_blocks for meta are set. + */ +static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) +{ + if (type == DATA) + return sbi->blocks_per_seg; + else if (type == NODE) + return 3 * sbi->blocks_per_seg; + else if (type == META) + return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + else + return 0; +} + +/* + * When writing pages, it'd better align nr_to_write for segment size. + */ +static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, + struct writeback_control *wbc) +{ + long nr_to_write, desired; + + if (wbc->sync_mode != WB_SYNC_NONE) + return 0; + + nr_to_write = wbc->nr_to_write; + + if (type == DATA) + desired = 4096; + else if (type == NODE) + desired = 3 * max_hw_blocks(sbi); + else + desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + wbc->nr_to_write = desired; + return desired - nr_to_write; +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 13d0a0fe49d..8f96d9372ad 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -43,11 +43,15 @@ enum { Opt_disable_roll_forward, Opt_discard, Opt_noheap, + Opt_user_xattr, Opt_nouser_xattr, + Opt_acl, Opt_noacl, Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_inline_data, + Opt_flush_merge, Opt_err, }; @@ -56,33 +60,59 @@ static match_table_t f2fs_tokens = { {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_discard, "discard"}, {Opt_noheap, "no_heap"}, + {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_inline_data, "inline_data"}, + {Opt_flush_merge, "flush_merge"}, {Opt_err, NULL}, }; /* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +}; + struct f2fs_attr { struct attribute attr; ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, const char *, size_t); + int struct_type; int offset; }; +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI) + return (unsigned char *)sbi; + return NULL; +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct f2fs_gc_kthread *gc_kth = sbi->gc_thread; + unsigned char *ptr = NULL; unsigned int *ui; - if (!gc_kth) + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) return -EINVAL; - ui = (unsigned int *)(((char *)gc_kth) + a->offset); + ui = (unsigned int *)(ptr + a->offset); return snprintf(buf, PAGE_SIZE, "%u\n", *ui); } @@ -91,15 +121,16 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, struct f2fs_sb_info *sbi, const char *buf, size_t count) { - struct f2fs_gc_kthread *gc_kth = sbi->gc_thread; + unsigned char *ptr; unsigned long t; unsigned int *ui; ssize_t ret; - if (!gc_kth) + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) return -EINVAL; - ui = (unsigned int *)(((char *)gc_kth) + a->offset); + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret < 0) @@ -135,21 +166,31 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } -#define F2FS_ATTR_OFFSET(_name, _mode, _show, _store, _elname) \ +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ - .offset = offsetof(struct f2fs_gc_kthread, _elname), \ + .struct_type = _struct_type, \ + .offset = _offset \ } -#define F2FS_RW_ATTR(name, elname) \ - F2FS_ATTR_OFFSET(name, 0644, f2fs_sbi_show, f2fs_sbi_store, elname) - -F2FS_RW_ATTR(gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(gc_idle, gc_idle); +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -157,6 +198,13 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_max_sleep_time), ATTR_LIST(gc_no_gc_sleep_time), ATTR_LIST(gc_idle), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), NULL, }; @@ -217,9 +265,9 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (!strncmp(name, "on", 2)) + if (strlen(name) == 2 && !strncmp(name, "on", 2)) set_opt(sbi, BG_GC); - else if (!strncmp(name, "off", 3)) + else if (strlen(name) == 3 && !strncmp(name, "off", 3)) clear_opt(sbi, BG_GC); else { kfree(name); @@ -237,6 +285,9 @@ static int parse_options(struct super_block *sb, char *options) set_opt(sbi, NOHEAP); break; #ifdef CONFIG_F2FS_FS_XATTR + case Opt_user_xattr: + set_opt(sbi, XATTR_USER); + break; case Opt_nouser_xattr: clear_opt(sbi, XATTR_USER); break; @@ -244,6 +295,10 @@ static int parse_options(struct super_block *sb, char *options) set_opt(sbi, INLINE_XATTR); break; #else + case Opt_user_xattr: + f2fs_msg(sb, KERN_INFO, + "user_xattr options not supported"); + break; case Opt_nouser_xattr: f2fs_msg(sb, KERN_INFO, "nouser_xattr options not supported"); @@ -254,10 +309,16 @@ static int parse_options(struct super_block *sb, char *options) break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi, POSIX_ACL); + break; case Opt_noacl: clear_opt(sbi, POSIX_ACL); break; #else + case Opt_acl: + f2fs_msg(sb, KERN_INFO, "acl options not supported"); + break; case Opt_noacl: f2fs_msg(sb, KERN_INFO, "noacl options not supported"); break; @@ -272,6 +333,12 @@ static int parse_options(struct super_block *sb, char *options) case Opt_disable_ext_identify: set_opt(sbi, DISABLE_EXT_IDENTIFY); break; + case Opt_inline_data: + set_opt(sbi, INLINE_DATA); + break; + case Opt_flush_merge: + set_opt(sbi, FLUSH_MERGE); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -286,7 +353,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO); + fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) return NULL; @@ -298,12 +365,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_current_depth = 1; fi->i_advise = 0; rwlock_init(&fi->ext.ext_lock); + init_rwsem(&fi->i_sem); set_inode_flag(fi, FI_NEW_INODE); if (test_opt(F2FS_SB(sb), INLINE_XATTR)) set_inode_flag(fi, FI_INLINE_XATTR); + /* Will be used by directory only */ + fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } @@ -355,7 +426,9 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_stats(sbi); stop_gc_thread(sbi); - write_checkpoint(sbi, true); + /* We don't need to do checkpoint when it's clean */ + if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES)) + write_checkpoint(sbi, true); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -441,7 +514,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC)) + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) seq_printf(seq, ",background_gc=%s", "on"); else seq_printf(seq, ",background_gc=%s", "off"); @@ -467,7 +540,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) #endif if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) seq_puts(seq, ",disable_ext_identify"); - + if (test_opt(sbi, INLINE_DATA)) + seq_puts(seq, ",inline_data"); + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + seq_puts(seq, ",flush_merge"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -477,16 +553,26 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); int i; + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + for (i = 0; i < total_segs; i++) { - seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); - if (i != 0 && (i % 10) == 0) - seq_puts(seq, "\n"); + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-5d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, 1)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); else - seq_puts(seq, " "); + seq_putc(seq, ' '); } + return 0; } @@ -508,6 +594,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; int err, active_logs; + bool need_restart_gc = false; + bool need_stop_gc = false; + + sync_filesystem(sb); /* * Save the old mount options in case we @@ -523,7 +613,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) /* * Previous and new state of filesystem is RO, - * so no point in checking GC conditions. + * so skip checking GC and FLUSH_MERGE conditions. */ if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) goto skip; @@ -537,18 +627,40 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (sbi->gc_thread) { stop_gc_thread(sbi); f2fs_sync_fs(sb, 1); + need_restart_gc = true; } } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) { err = start_gc_thread(sbi); if (err) goto restore_opts; + need_stop_gc = true; + } + + /* + * We stop issue flush thread if FS is mounted as RO + * or if flush_merge is not passed in mount option. + */ + if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + destroy_flush_cmd_control(sbi); + } else if (test_opt(sbi, FLUSH_MERGE) && + !sbi->sm_info->cmd_control_info) { + err = create_flush_cmd_control(sbi); + if (err) + goto restore_gc; } skip: /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); return 0; - +restore_gc: + if (need_restart_gc) { + if (start_gc_thread(sbi)) + f2fs_msg(sbi->sb, KERN_WARNING, + "background gc thread is stop"); + } else if (need_stop_gc) { + stop_gc_thread(sbi); + } restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; @@ -577,7 +689,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (ino < F2FS_ROOT_INO(sbi)) + if (check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* @@ -588,7 +700,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { + if (unlikely(generation && inode->i_generation != generation)) { /* we didn't find the right inode.. */ iput(inode); return ERR_PTR(-ESTALE); @@ -691,10 +803,10 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); fsmeta += le32_to_cpu(raw_super->segment_count_ssa); - if (fsmeta >= total) + if (unlikely(fsmeta >= total)) return 1; - if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; } @@ -722,35 +834,56 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; + sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + + sbi->dir_level = DEF_DIR_LEVEL; } -static int validate_superblock(struct super_block *sb, - struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, sector_t block) +/* + * Read f2fs raw super block. + * Because we have two copies of super block, so read the first one at first, + * if the first one is invalid, move to read the second one. + */ +static int read_raw_super_block(struct super_block *sb, + struct f2fs_super_block **raw_super, + struct buffer_head **raw_super_buf) { - const char *super = (block == 0 ? "first" : "second"); + int block = 0; - /* read f2fs raw super block */ +retry: *raw_super_buf = sb_bread(sb, block); if (!*raw_super_buf) { - f2fs_msg(sb, KERN_ERR, "unable to read %s superblock", - super); - return -EIO; + f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", + block + 1); + if (block == 0) { + block++; + goto retry; + } else { + return -EIO; + } } *raw_super = (struct f2fs_super_block *) ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET); /* sanity checking of raw super */ - if (!sanity_check_raw_super(sb, *raw_super)) - return 0; + if (sanity_check_raw_super(sb, *raw_super)) { + brelse(*raw_super_buf); + f2fs_msg(sb, KERN_ERR, + "Can't find valid F2FS filesystem in %dth superblock", + block + 1); + if (block == 0) { + block++; + goto retry; + } else { + return -EINVAL; + } + } - f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " - "in %s superblock", super); - return -EINVAL; + return 0; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) @@ -768,19 +901,15 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; /* set a block size */ - if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { + if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; } - err = validate_superblock(sb, &raw_super, &raw_super_buf, 0); - if (err) { - brelse(raw_super_buf); - /* check secondary superblock when primary failed */ - err = validate_superblock(sb, &raw_super, &raw_super_buf, 1); - if (err) - goto free_sb_buf; - } + err = read_raw_super_block(sb, &raw_super, &raw_super_buf); + if (err) + goto free_sbi; + sb->s_fs_info = sbi; /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; @@ -818,12 +947,21 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->gc_mutex); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); - for (i = 0; i < NR_GLOBAL_LOCKS; i++) - mutex_init(&sbi->fs_lock[i]); mutex_init(&sbi->node_write); - sbi->por_doing = 0; + sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->bio_sem); + + init_rwsem(&sbi->read_io.io_rwsem); + sbi->read_io.sbi = sbi; + sbi->read_io.bio = NULL; + for (i = 0; i < NR_PAGE_TYPE; i++) { + init_rwsem(&sbi->write_io[i].io_rwsem); + sbi->write_io[i].sbi = sbi; + sbi->write_io[i].bio = NULL; + } + + init_rwsem(&sbi->cp_rwsem); + init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); /* get an inode for meta space */ @@ -886,9 +1024,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* if there are nt orphan nodes free them */ - err = -EINVAL; - if (recover_orphan_inodes(sbi)) - goto free_node_inode; + recover_orphan_inodes(sbi); /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -897,8 +1033,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(root); goto free_node_inode; } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + err = -EINVAL; goto free_root_inode; + } sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { @@ -906,28 +1044,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; } - /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { - err = recover_fsync_data(sbi); - if (err) - f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); - } - - /* - * If filesystem is not mounted as read-only then - * do start the gc_thread. - */ - if (!(sb->s_flags & MS_RDONLY)) { - /* After POR, we can run background GC thread.*/ - err = start_gc_thread(sbi); - if (err) - goto fail; - } - err = f2fs_build_stats(sbi); if (err) - goto fail; + goto free_root_inode; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -949,11 +1068,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, "%s", sb->s_id); if (err) - goto fail; + goto free_proc; + /* recover fsynced data */ + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + err = recover_fsync_data(sbi); + if (err) + f2fs_msg(sb, KERN_ERR, + "Cannot recover all fsync data errno=%ld", err); + } + + /* + * If filesystem is not mounted as read-only then + * do start the gc_thread. + */ + if (!(sb->s_flags & MS_RDONLY)) { + /* After POR, we can run background GC thread.*/ + err = start_gc_thread(sbi); + if (err) + goto free_kobj; + } return 0; -fail: - stop_gc_thread(sbi); + +free_kobj: + kobject_del(&sbi->s_kobj); +free_proc: + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + f2fs_destroy_stats(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -993,8 +1137,8 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info), NULL); - if (f2fs_inode_cachep == NULL) + sizeof(struct f2fs_inode_info)); + if (!f2fs_inode_cachep) return -ENOMEM; return 0; } @@ -1019,9 +1163,12 @@ static int __init init_f2fs_fs(void) err = create_node_manager_caches(); if (err) goto free_inodecache; - err = create_gc_caches(); + err = create_segment_manager_caches(); if (err) goto free_node_manager_caches; + err = create_gc_caches(); + if (err) + goto free_segment_manager_caches; err = create_checkpoint_caches(); if (err) goto free_gc_caches; @@ -1043,6 +1190,8 @@ free_checkpoint_caches: destroy_checkpoint_caches(); free_gc_caches: destroy_gc_caches(); +free_segment_manager_caches: + destroy_segment_manager_caches(); free_node_manager_caches: destroy_node_manager_caches(); free_inodecache: @@ -1058,6 +1207,7 @@ static void __exit exit_f2fs_fs(void) unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); + destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); kset_unregister(f2fs_kset); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1ac8a5f6e38..8bea941ee30 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -21,11 +21,12 @@ #include <linux/rwsem.h> #include <linux/f2fs_fs.h> #include <linux/security.h> +#include <linux/posix_acl_xattr.h> #include "f2fs.h" #include "xattr.h" static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) + size_t list_size, const char *name, size_t len, int type) { struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); int total_len, prefix_len = 0; @@ -52,11 +53,11 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, return -EINVAL; } - total_len = prefix_len + name_len + 1; + total_len = prefix_len + len + 1; if (list && total_len <= list_size) { memcpy(list, prefix, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; + memcpy(list + prefix_len, name, len); + list[prefix_len + len] = '\0'; } return total_len; } @@ -107,11 +108,12 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL); + return f2fs_setxattr(dentry->d_inode, type, name, + value, size, NULL, flags); } static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) + size_t list_size, const char *name, size_t len, int type) { const char *xname = F2FS_SYSTEM_ADVISE_PREFIX; size_t size; @@ -163,7 +165,7 @@ static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, for (xattr = xattr_array; xattr->name != NULL; xattr++) { err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, xattr->name, xattr->value, - xattr->value_len, (struct page *)page); + xattr->value_len, (struct page *)page, 0); if (err < 0) break; } @@ -213,8 +215,8 @@ const struct xattr_handler f2fs_xattr_security_handler = { static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler, - [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY @@ -226,8 +228,8 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { const struct xattr_handler *f2fs_xattr_handlers[] = { &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - &f2fs_xattr_acl_access_handler, - &f2fs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY @@ -237,26 +239,26 @@ const struct xattr_handler *f2fs_xattr_handlers[] = { NULL, }; -static inline const struct xattr_handler *f2fs_xattr_handler(int name_index) +static inline const struct xattr_handler *f2fs_xattr_handler(int index) { const struct xattr_handler *handler = NULL; - if (name_index > 0 && name_index < ARRAY_SIZE(f2fs_xattr_handler_map)) - handler = f2fs_xattr_handler_map[name_index]; + if (index > 0 && index < ARRAY_SIZE(f2fs_xattr_handler_map)) + handler = f2fs_xattr_handler_map[index]; return handler; } -static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int name_index, - size_t name_len, const char *name) +static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, + size_t len, const char *name) { struct f2fs_xattr_entry *entry; list_for_each_xattr(entry, base_addr) { - if (entry->e_name_index != name_index) + if (entry->e_name_index != index) continue; - if (entry->e_name_len != name_len) + if (entry->e_name_len != len) continue; - if (!memcmp(entry->e_name, name, name_len)) + if (!memcmp(entry->e_name, name, len)) break; } return entry; @@ -271,7 +273,7 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_size = inline_xattr_size(inode); - txattr_addr = kzalloc(inline_size + size, GFP_KERNEL); + txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) return NULL; @@ -343,6 +345,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(ipage); + f2fs_wait_on_page_writeback(ipage, NODE); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -350,6 +353,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(page); } inline_addr = inline_xattr_addr(page); + f2fs_wait_on_page_writeback(page, NODE); } memcpy(inline_addr, txattr_addr, inline_size); f2fs_put_page(page, 1); @@ -369,7 +373,8 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); } - BUG_ON(new_nid); + f2fs_bug_on(new_nid); + f2fs_wait_on_page_writeback(xpage, NODE); } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); @@ -392,40 +397,43 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return 0; } -int f2fs_getxattr(struct inode *inode, int name_index, const char *name, +int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { struct f2fs_xattr_entry *entry; void *base_addr; int error = 0; - size_t value_len, name_len; + size_t size, len; if (name == NULL) return -EINVAL; - name_len = strlen(name); + + len = strlen(name); + if (len > F2FS_NAME_LEN) + return -ERANGE; base_addr = read_all_xattrs(inode, NULL); if (!base_addr) return -ENOMEM; - entry = __find_xattr(base_addr, name_index, name_len, name); + entry = __find_xattr(base_addr, index, len, name); if (IS_XATTR_LAST_ENTRY(entry)) { error = -ENODATA; goto cleanup; } - value_len = le16_to_cpu(entry->e_value_size); + size = le16_to_cpu(entry->e_value_size); - if (buffer && value_len > buffer_size) { + if (buffer && size > buffer_size) { error = -ERANGE; goto cleanup; } if (buffer) { char *pval = entry->e_name + entry->e_name_len; - memcpy(buffer, pval, value_len); + memcpy(buffer, pval, size); } - error = value_len; + error = size; cleanup: kzfree(base_addr); @@ -469,16 +477,15 @@ cleanup: return error; } -int f2fs_setxattr(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len, struct page *ipage) +static int __f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, + struct page *ipage, int flags) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *here, *last; void *base_addr; int found, newsize; - size_t name_len; - int ilock; + size_t len; __u32 new_hsize; int error = -ENOMEM; @@ -486,32 +493,35 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, return -EINVAL; if (value == NULL) - value_len = 0; + size = 0; - name_len = strlen(name); + len = strlen(name); - if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN(inode)) + if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode)) return -ERANGE; - f2fs_balance_fs(sbi); - - ilock = mutex_lock_op(sbi); - base_addr = read_all_xattrs(inode, ipage); if (!base_addr) goto exit; /* find entry with wanted name. */ - here = __find_xattr(base_addr, name_index, name_len, name); + here = __find_xattr(base_addr, index, len, name); found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; - last = here; + if ((flags & XATTR_REPLACE) && !found) { + error = -ENODATA; + goto exit; + } else if ((flags & XATTR_CREATE) && found) { + error = -EEXIST; + goto exit; + } + + last = here; while (!IS_XATTR_LAST_ENTRY(last)) last = XATTR_NEXT_ENTRY(last); - newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + - name_len + value_len); + newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); /* 1. Check space */ if (value) { @@ -522,9 +532,9 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, */ free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); if (found) - free = free - ENTRY_SIZE(here); + free = free + ENTRY_SIZE(here); - if (free < newsize) { + if (unlikely(free < newsize)) { error = -ENOSPC; goto exit; } @@ -554,12 +564,12 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, * We just write new entry. */ memset(last, 0, newsize); - last->e_name_index = name_index; - last->e_name_len = name_len; - memcpy(last->e_name, name, name_len); - pval = last->e_name + name_len; - memcpy(pval, value, value_len); - last->e_value_size = cpu_to_le16(value_len); + last->e_name_index = index; + last->e_name_len = len; + memcpy(last->e_name, name, len); + pval = last->e_name + len; + memcpy(pval, value, size); + last->e_value_size = cpu_to_le16(size); new_hsize += newsize; } @@ -578,7 +588,29 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, else update_inode_page(inode); exit: - mutex_unlock_op(sbi, ilock); kzfree(base_addr); return error; } + +int f2fs_setxattr(struct inode *inode, int index, const char *name, + const void *value, size_t size, + struct page *ipage, int flags) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int err; + + /* this case is only from init_inode_metadata */ + if (ipage) + return __f2fs_setxattr(inode, index, name, value, + size, ipage, flags); + f2fs_balance_fs(sbi); + + f2fs_lock_op(sbi); + /* protect xattr_ver */ + down_write(&F2FS_I(inode)->i_sem); + err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_sem); + f2fs_unlock_op(sbi); + + return err; +} diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 02a08fb88a1..34ab7dbcf5e 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -108,26 +108,24 @@ struct f2fs_xattr_entry { #ifdef CONFIG_F2FS_FS_XATTR extern const struct xattr_handler f2fs_xattr_user_handler; extern const struct xattr_handler f2fs_xattr_trusted_handler; -extern const struct xattr_handler f2fs_xattr_acl_access_handler; -extern const struct xattr_handler f2fs_xattr_acl_default_handler; extern const struct xattr_handler f2fs_xattr_advise_handler; extern const struct xattr_handler f2fs_xattr_security_handler; extern const struct xattr_handler *f2fs_xattr_handlers[]; extern int f2fs_setxattr(struct inode *, int, const char *, - const void *, size_t, struct page *); + const void *, size_t, struct page *, int); extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t); extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); #else #define f2fs_xattr_handlers NULL -static inline int f2fs_setxattr(struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len) +static inline int f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, int flags) { return -EOPNOTSUPP; } -static inline int f2fs_getxattr(struct inode *inode, int name_index, +static inline int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { return -EOPNOTSUPP; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 4241e6f39e8..e0c4ba39a37 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -52,7 +52,8 @@ struct fat_mount_options { usefree:1, /* Use free_clusters for FAT32 */ tz_set:1, /* Filesystem timestamps' offset set */ rodir:1, /* allow ATTR_RO for directory */ - discard:1; /* Issue discard requests on deletions */ + discard:1, /* Issue discard requests on deletions */ + dos1xfloppy:1; /* Assume default BPB for DOS 1.x floppies */ }; #define FAT_HASH_BITS 8 @@ -102,6 +103,7 @@ struct msdos_sb_info { struct hlist_head dir_hashtable[FAT_HASH_SIZE]; unsigned int dirty; /* fs state before mount */ + struct rcu_head rcu; }; #define FAT_CACHE_VALID 0 /* special case for valid cache */ diff --git a/fs/fat/file.c b/fs/fat/file.c index 9b104f54305..85f79a89e74 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -170,10 +170,10 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) const struct file_operations fat_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .release = fat_file_release, .unlocked_ioctl = fat_generic_ioctl, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 0062da21dd8..756aead10d9 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -35,9 +35,71 @@ #define CONFIG_FAT_DEFAULT_IOCHARSET "" #endif +#define KB_IN_SECTORS 2 + +/* + * A deserialized copy of the on-disk structure laid out in struct + * fat_boot_sector. + */ +struct fat_bios_param_block { + u16 fat_sector_size; + u8 fat_sec_per_clus; + u16 fat_reserved; + u8 fat_fats; + u16 fat_dir_entries; + u16 fat_sectors; + u16 fat_fat_length; + u32 fat_total_sect; + + u8 fat16_state; + u32 fat16_vol_id; + + u32 fat32_length; + u32 fat32_root_cluster; + u16 fat32_info_sector; + u8 fat32_state; + u32 fat32_vol_id; +}; + static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE; static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET; +static struct fat_floppy_defaults { + unsigned nr_sectors; + unsigned sec_per_clus; + unsigned dir_entries; + unsigned media; + unsigned fat_length; +} floppy_defaults[] = { +{ + .nr_sectors = 160 * KB_IN_SECTORS, + .sec_per_clus = 1, + .dir_entries = 64, + .media = 0xFE, + .fat_length = 1, +}, +{ + .nr_sectors = 180 * KB_IN_SECTORS, + .sec_per_clus = 1, + .dir_entries = 64, + .media = 0xFC, + .fat_length = 2, +}, +{ + .nr_sectors = 320 * KB_IN_SECTORS, + .sec_per_clus = 2, + .dir_entries = 112, + .media = 0xFF, + .fat_length = 1, +}, +{ + .nr_sectors = 360 * KB_IN_SECTORS, + .sec_per_clus = 2, + .dir_entries = 112, + .media = 0xFD, + .fat_length = 2, +}, +}; static int fat_add_cluster(struct inode *inode) { @@ -185,12 +247,13 @@ static int fat_write_end(struct file *file, struct address_space *mapping, } static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; if (rw == WRITE) { @@ -203,7 +266,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, * * Return 0, and fallback to normal buffered write. */ - loff_t size = offset + iov_length(iov, nr_segs); + loff_t size = offset + count; if (MSDOS_I(inode)->mmu_private < size) return 0; } @@ -212,10 +275,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). */ - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - fat_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, fat_get_block); if (ret < 0 && (rw & WRITE)) - fat_write_failed(mapping, offset + iov_length(iov, nr_segs)); + fat_write_failed(mapping, offset + count); return ret; } @@ -359,7 +421,7 @@ struct inode *fat_iget(struct super_block *sb, loff_t i_pos) static int is_exec(unsigned char *extension) { - unsigned char *exe_extensions = "EXECOMBAT", *walk; + unsigned char exe_extensions[] = "EXECOMBAT", *walk; for (walk = exe_extensions; *walk; walk += 3) if (!strncmp(extension, walk, 3)) @@ -490,7 +552,7 @@ EXPORT_SYMBOL_GPL(fat_build_inode); static void fat_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); @@ -548,6 +610,16 @@ static void fat_set_state(struct super_block *sb, brelse(bh); } +static void delayed_free(struct rcu_head *p) +{ + struct msdos_sb_info *sbi = container_of(p, struct msdos_sb_info, rcu); + unload_nls(sbi->nls_disk); + unload_nls(sbi->nls_io); + if (sbi->options.iocharset != fat_default_iocharset) + kfree(sbi->options.iocharset); + kfree(sbi); +} + static void fat_put_super(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -557,14 +629,7 @@ static void fat_put_super(struct super_block *sb) iput(sbi->fsinfo_inode); iput(sbi->fat_inode); - unload_nls(sbi->nls_disk); - unload_nls(sbi->nls_io); - - if (sbi->options.iocharset != fat_default_iocharset) - kfree(sbi->options.iocharset); - - sb->s_fs_info = NULL; - kfree(sbi); + call_rcu(&sbi->rcu, delayed_free); } static struct kmem_cache *fat_inode_cachep; @@ -632,6 +697,8 @@ static int fat_remount(struct super_block *sb, int *flags, char *data) struct msdos_sb_info *sbi = MSDOS_SB(sb); *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + sync_filesystem(sb); + /* make sure we update state on remount. */ new_rdonly = *flags & MS_RDONLY; if (new_rdonly != (sb->s_flags & MS_RDONLY)) { @@ -848,6 +915,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nfs=stale_rw"); if (opts->discard) seq_puts(m, ",discard"); + if (opts->dos1xfloppy) + seq_puts(m, ",dos1xfloppy"); return 0; } @@ -862,7 +931,7 @@ enum { Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset, - Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, + Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy, }; static const match_table_t fat_tokens = { @@ -895,6 +964,7 @@ static const match_table_t fat_tokens = { {Opt_nfs_stale_rw, "nfs"}, {Opt_nfs_stale_rw, "nfs=stale_rw"}, {Opt_nfs_nostale_ro, "nfs=nostale_ro"}, + {Opt_dos1xfloppy, "dos1xfloppy"}, {Opt_obsolete, "conv=binary"}, {Opt_obsolete, "conv=text"}, {Opt_obsolete, "conv=auto"}, @@ -1097,6 +1167,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_nfs_nostale_ro: opts->nfs = FAT_NFS_NOSTALE_RO; break; + case Opt_dos1xfloppy: + opts->dos1xfloppy = 1; + break; /* msdos specific */ case Opt_dots: @@ -1242,6 +1315,169 @@ static unsigned long calc_fat_clusters(struct super_block *sb) return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; } +static bool fat_bpb_is_zero(struct fat_boot_sector *b) +{ + if (get_unaligned_le16(&b->sector_size)) + return false; + if (b->sec_per_clus) + return false; + if (b->reserved) + return false; + if (b->fats) + return false; + if (get_unaligned_le16(&b->dir_entries)) + return false; + if (get_unaligned_le16(&b->sectors)) + return false; + if (b->media) + return false; + if (b->fat_length) + return false; + if (b->secs_track) + return false; + if (b->heads) + return false; + return true; +} + +static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b, + int silent, struct fat_bios_param_block *bpb) +{ + int error = -EINVAL; + + /* Read in BPB ... */ + memset(bpb, 0, sizeof(*bpb)); + bpb->fat_sector_size = get_unaligned_le16(&b->sector_size); + bpb->fat_sec_per_clus = b->sec_per_clus; + bpb->fat_reserved = le16_to_cpu(b->reserved); + bpb->fat_fats = b->fats; + bpb->fat_dir_entries = get_unaligned_le16(&b->dir_entries); + bpb->fat_sectors = get_unaligned_le16(&b->sectors); + bpb->fat_fat_length = le16_to_cpu(b->fat_length); + bpb->fat_total_sect = le32_to_cpu(b->total_sect); + + bpb->fat16_state = b->fat16.state; + bpb->fat16_vol_id = get_unaligned_le32(b->fat16.vol_id); + + bpb->fat32_length = le32_to_cpu(b->fat32.length); + bpb->fat32_root_cluster = le32_to_cpu(b->fat32.root_cluster); + bpb->fat32_info_sector = le16_to_cpu(b->fat32.info_sector); + bpb->fat32_state = b->fat32.state; + bpb->fat32_vol_id = get_unaligned_le32(b->fat32.vol_id); + + /* Validate this looks like a FAT filesystem BPB */ + if (!bpb->fat_reserved) { + if (!silent) + fat_msg(sb, KERN_ERR, + "bogus number of reserved sectors"); + goto out; + } + if (!bpb->fat_fats) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus number of FAT structure"); + goto out; + } + + /* + * Earlier we checked here that b->secs_track and b->head are nonzero, + * but it turns out valid FAT filesystems can have zero there. + */ + + if (!fat_valid_media(b->media)) { + if (!silent) + fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)", + (unsigned)b->media); + goto out; + } + + if (!is_power_of_2(bpb->fat_sector_size) + || (bpb->fat_sector_size < 512) + || (bpb->fat_sector_size > 4096)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus logical sector size %u", + (unsigned)bpb->fat_sector_size); + goto out; + } + + if (!is_power_of_2(bpb->fat_sec_per_clus)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u", + (unsigned)bpb->fat_sec_per_clus); + goto out; + } + + error = 0; + +out: + return error; +} + +static int fat_read_static_bpb(struct super_block *sb, + struct fat_boot_sector *b, int silent, + struct fat_bios_param_block *bpb) +{ + static const char *notdos1x = "This doesn't look like a DOS 1.x volume"; + + struct fat_floppy_defaults *fdefaults = NULL; + int error = -EINVAL; + sector_t bd_sects; + unsigned i; + + bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE; + + /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */ + if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) { + if (!silent) + fat_msg(sb, KERN_ERR, + "%s; no bootstrapping code", notdos1x); + goto out; + } + + /* + * If any value in this region is non-zero, it isn't archaic + * DOS. + */ + if (!fat_bpb_is_zero(b)) { + if (!silent) + fat_msg(sb, KERN_ERR, + "%s; DOS 2.x BPB is non-zero", notdos1x); + goto out; + } + + for (i = 0; i < ARRAY_SIZE(floppy_defaults); i++) { + if (floppy_defaults[i].nr_sectors == bd_sects) { + fdefaults = &floppy_defaults[i]; + break; + } + } + + if (fdefaults == NULL) { + if (!silent) + fat_msg(sb, KERN_WARNING, + "This looks like a DOS 1.x volume, but isn't a recognized floppy size (%llu sectors)", + (u64)bd_sects); + goto out; + } + + if (!silent) + fat_msg(sb, KERN_INFO, + "This looks like a DOS 1.x volume; assuming default BPB values"); + + memset(bpb, 0, sizeof(*bpb)); + bpb->fat_sector_size = SECTOR_SIZE; + bpb->fat_sec_per_clus = fdefaults->sec_per_clus; + bpb->fat_reserved = 1; + bpb->fat_fats = 2; + bpb->fat_dir_entries = fdefaults->dir_entries; + bpb->fat_sectors = fdefaults->nr_sectors; + bpb->fat_fat_length = fdefaults->fat_length; + + error = 0; + +out: + return error; +} + /* * Read the super block of an MS-DOS FS. */ @@ -1251,12 +1487,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, struct inode *root_inode = NULL, *fat_inode = NULL; struct inode *fsinfo_inode = NULL; struct buffer_head *bh; - struct fat_boot_sector *b; + struct fat_bios_param_block bpb; struct msdos_sb_info *sbi; u16 logical_sector_size; u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors; int debug; - unsigned int media; long error; char buf[50]; @@ -1293,100 +1528,72 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, goto out_fail; } - b = (struct fat_boot_sector *) bh->b_data; - if (!b->reserved) { - if (!silent) - fat_msg(sb, KERN_ERR, "bogus number of reserved sectors"); - brelse(bh); - goto out_invalid; - } - if (!b->fats) { - if (!silent) - fat_msg(sb, KERN_ERR, "bogus number of FAT structure"); - brelse(bh); - goto out_invalid; - } - - /* - * Earlier we checked here that b->secs_track and b->head are nonzero, - * but it turns out valid FAT filesystems can have zero there. - */ + error = fat_read_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent, + &bpb); + if (error == -EINVAL && sbi->options.dos1xfloppy) + error = fat_read_static_bpb(sb, + (struct fat_boot_sector *)bh->b_data, silent, &bpb); + brelse(bh); - media = b->media; - if (!fat_valid_media(media)) { - if (!silent) - fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)", - media); - brelse(bh); - goto out_invalid; - } - logical_sector_size = get_unaligned_le16(&b->sector_size); - if (!is_power_of_2(logical_sector_size) - || (logical_sector_size < 512) - || (logical_sector_size > 4096)) { - if (!silent) - fat_msg(sb, KERN_ERR, "bogus logical sector size %u", - logical_sector_size); - brelse(bh); + if (error == -EINVAL) goto out_invalid; - } - sbi->sec_per_clus = b->sec_per_clus; - if (!is_power_of_2(sbi->sec_per_clus)) { - if (!silent) - fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u", - sbi->sec_per_clus); - brelse(bh); - goto out_invalid; - } + else if (error) + goto out_fail; + logical_sector_size = bpb.fat_sector_size; + sbi->sec_per_clus = bpb.fat_sec_per_clus; + + error = -EIO; if (logical_sector_size < sb->s_blocksize) { fat_msg(sb, KERN_ERR, "logical sector size too small for device" " (logical sector size = %u)", logical_sector_size); - brelse(bh); goto out_fail; } + if (logical_sector_size > sb->s_blocksize) { - brelse(bh); + struct buffer_head *bh_resize; if (!sb_set_blocksize(sb, logical_sector_size)) { fat_msg(sb, KERN_ERR, "unable to set blocksize %u", logical_sector_size); goto out_fail; } - bh = sb_bread(sb, 0); - if (bh == NULL) { + + /* Verify that the larger boot sector is fully readable */ + bh_resize = sb_bread(sb, 0); + if (bh_resize == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector" " (logical sector size = %lu)", sb->s_blocksize); goto out_fail; } - b = (struct fat_boot_sector *) bh->b_data; + brelse(bh_resize); } mutex_init(&sbi->s_lock); sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus; sbi->cluster_bits = ffs(sbi->cluster_size) - 1; - sbi->fats = b->fats; + sbi->fats = bpb.fat_fats; sbi->fat_bits = 0; /* Don't know yet */ - sbi->fat_start = le16_to_cpu(b->reserved); - sbi->fat_length = le16_to_cpu(b->fat_length); + sbi->fat_start = bpb.fat_reserved; + sbi->fat_length = bpb.fat_fat_length; sbi->root_cluster = 0; sbi->free_clusters = -1; /* Don't know yet */ sbi->free_clus_valid = 0; sbi->prev_free = FAT_START_ENT; sb->s_maxbytes = 0xffffffff; - if (!sbi->fat_length && b->fat32.length) { + if (!sbi->fat_length && bpb.fat32_length) { struct fat_boot_fsinfo *fsinfo; struct buffer_head *fsinfo_bh; /* Must be FAT32 */ sbi->fat_bits = 32; - sbi->fat_length = le32_to_cpu(b->fat32.length); - sbi->root_cluster = le32_to_cpu(b->fat32.root_cluster); + sbi->fat_length = bpb.fat32_length; + sbi->root_cluster = bpb.fat32_root_cluster; /* MC - if info_sector is 0, don't multiply by 0 */ - sbi->fsinfo_sector = le16_to_cpu(b->fat32.info_sector); + sbi->fsinfo_sector = bpb.fat32_info_sector; if (sbi->fsinfo_sector == 0) sbi->fsinfo_sector = 1; @@ -1394,7 +1601,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, if (fsinfo_bh == NULL) { fat_msg(sb, KERN_ERR, "bread failed, FSINFO block" " (sector = %lu)", sbi->fsinfo_sector); - brelse(bh); goto out_fail; } @@ -1417,35 +1623,28 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, /* interpret volume ID as a little endian 32 bit integer */ if (sbi->fat_bits == 32) - sbi->vol_id = (((u32)b->fat32.vol_id[0]) | - ((u32)b->fat32.vol_id[1] << 8) | - ((u32)b->fat32.vol_id[2] << 16) | - ((u32)b->fat32.vol_id[3] << 24)); + sbi->vol_id = bpb.fat32_vol_id; else /* fat 16 or 12 */ - sbi->vol_id = (((u32)b->fat16.vol_id[0]) | - ((u32)b->fat16.vol_id[1] << 8) | - ((u32)b->fat16.vol_id[2] << 16) | - ((u32)b->fat16.vol_id[3] << 24)); + sbi->vol_id = bpb.fat16_vol_id; sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length; - sbi->dir_entries = get_unaligned_le16(&b->dir_entries); + sbi->dir_entries = bpb.fat_dir_entries; if (sbi->dir_entries & (sbi->dir_per_block - 1)) { if (!silent) fat_msg(sb, KERN_ERR, "bogus directory-entries per block" " (%u)", sbi->dir_entries); - brelse(bh); goto out_invalid; } rootdir_sectors = sbi->dir_entries * sizeof(struct msdos_dir_entry) / sb->s_blocksize; sbi->data_start = sbi->dir_start + rootdir_sectors; - total_sectors = get_unaligned_le16(&b->sectors); + total_sectors = bpb.fat_sectors; if (total_sectors == 0) - total_sectors = le32_to_cpu(b->total_sect); + total_sectors = bpb.fat_total_sect; total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus; @@ -1454,9 +1653,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, /* some OSes set FAT_STATE_DIRTY and clean it on unmount. */ if (sbi->fat_bits == 32) - sbi->dirty = b->fat32.state & FAT_STATE_DIRTY; + sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY; else /* fat 16 or 12 */ - sbi->dirty = b->fat16.state & FAT_STATE_DIRTY; + sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY; /* check that FAT table does not overflow */ fat_clusters = calc_fat_clusters(sb); @@ -1465,7 +1664,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, if (!silent) fat_msg(sb, KERN_ERR, "count of clusters too big (%u)", total_clusters); - brelse(bh); goto out_invalid; } @@ -1478,8 +1676,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, if (sbi->prev_free < FAT_START_ENT) sbi->prev_free = FAT_START_ENT; - brelse(bh); - /* set up enough so that it can read an inode */ fat_hash_init(sb); dir_hash_init(sb); diff --git a/fs/fcntl.c b/fs/fcntl.c index 65343c3741f..72c82f69b01 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -56,7 +56,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) return -EINVAL; } - if (filp->f_op && filp->f_op->check_flags) + if (filp->f_op->check_flags) error = filp->f_op->check_flags(arg); if (error) return error; @@ -64,8 +64,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) /* * ->fasync() is responsible for setting the FASYNC bit. */ - if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op && - filp->f_op->fasync) { + if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) { error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); if (error < 0) goto out; @@ -273,9 +272,19 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_SETFL: err = setfl(fd, filp, arg); break; +#if BITS_PER_LONG != 32 + /* 32-bit arches must use fcntl64() */ + case F_OFD_GETLK: +#endif case F_GETLK: - err = fcntl_getlk(filp, (struct flock __user *) arg); + err = fcntl_getlk(filp, cmd, (struct flock __user *) arg); break; +#if BITS_PER_LONG != 32 + /* 32-bit arches must use fcntl64() */ + case F_OFD_SETLK: + case F_OFD_SETLKW: +#endif + /* Fallthrough */ case F_SETLK: case F_SETLKW: err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); @@ -389,17 +398,20 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, goto out1; switch (cmd) { - case F_GETLK64: - err = fcntl_getlk64(f.file, (struct flock64 __user *) arg); - break; - case F_SETLK64: - case F_SETLKW64: - err = fcntl_setlk64(fd, f.file, cmd, - (struct flock64 __user *) arg); - break; - default: - err = do_fcntl(fd, cmd, arg, f.file); - break; + case F_GETLK64: + case F_OFD_GETLK: + err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg); + break; + case F_SETLK64: + case F_SETLKW64: + case F_OFD_SETLK: + case F_OFD_SETLKW: + err = fcntl_setlk64(fd, f.file, cmd, + (struct flock64 __user *) arg); + break; + default: + err = do_fcntl(fd, cmd, arg, f.file); + break; } out1: fdput(f); diff --git a/fs/file.c b/fs/file.c index 4a78f981557..66923fe3176 100644 --- a/fs/file.c +++ b/fs/file.c @@ -25,7 +25,10 @@ int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; -int sysctl_nr_open_max = 1024 * 1024; /* raised later */ +/* our max() is unusable in constant expressions ;-/ */ +#define __const_max(x, y) ((x) < (y) ? (x) : (y)) +int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) & + -BITS_PER_LONG; static void *alloc_fdmem(size_t size) { @@ -34,22 +37,17 @@ static void *alloc_fdmem(size_t size) * vmalloc() if the allocation size will be considered "large" by the VM. */ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); + void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY); if (data != NULL) return data; } return vmalloc(size); } -static void free_fdmem(void *ptr) -{ - is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr); -} - static void __free_fdtable(struct fdtable *fdt) { - free_fdmem(fdt->fd); - free_fdmem(fdt->open_fds); + kvfree(fdt->fd); + kvfree(fdt->open_fds); kfree(fdt); } @@ -127,7 +125,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) return fdt; out_arr: - free_fdmem(fdt->fd); + kvfree(fdt->fd); out_fdt: kfree(fdt); out: @@ -348,21 +346,16 @@ out: return NULL; } -static void close_files(struct files_struct * files) +static struct fdtable *close_files(struct files_struct * files) { - int i, j; - struct fdtable *fdt; - - j = 0; - /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the - * files structure. But use RCU to shut RCU-lockdep up. + * files structure. */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + int i, j = 0; + for (;;) { unsigned long set; i = j * BITS_PER_LONG; @@ -381,6 +374,8 @@ static void close_files(struct files_struct * files) set >>= 1; } } + + return fdt; } struct files_struct *get_files_struct(struct task_struct *task) @@ -398,14 +393,9 @@ struct files_struct *get_files_struct(struct task_struct *task) void put_files_struct(struct files_struct *files) { - struct fdtable *fdt; - if (atomic_dec_and_test(&files->count)) { - close_files(files); - /* not really needed, since nobody can see us */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); + struct fdtable *fdt = close_files(files); + /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); @@ -437,12 +427,6 @@ void exit_files(struct task_struct *tsk) } } -void __init files_defer_init(void) -{ - sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & - -BITS_PER_LONG; -} - struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, @@ -505,7 +489,7 @@ repeat: error = fd; #if 1 /* Sanity check */ - if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { + if (rcu_access_pointer(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); rcu_assign_pointer(fdt->fd[fd], NULL); } @@ -645,16 +629,16 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -struct file *fget(unsigned int fd) +static struct file *__fget(unsigned int fd, fmode_t mask) { - struct file *file; struct files_struct *files = current->files; + struct file *file; rcu_read_lock(); file = fcheck_files(files, fd); if (file) { /* File object ref couldn't be taken */ - if (file->f_mode & FMODE_PATH || + if ((file->f_mode & mask) || !atomic_long_inc_not_zero(&file->f_count)) file = NULL; } @@ -663,25 +647,16 @@ struct file *fget(unsigned int fd) return file; } +struct file *fget(unsigned int fd) +{ + return __fget(fd, FMODE_PATH); +} EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { - struct file *file; - struct files_struct *files = current->files; - - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - /* File object ref couldn't be taken */ - if (!atomic_long_inc_not_zero(&file->f_count)) - file = NULL; - } - rcu_read_unlock(); - - return file; + return __fget(fd, 0); } - EXPORT_SYMBOL(fget_raw); /* @@ -700,58 +675,54 @@ EXPORT_SYMBOL(fget_raw); * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. */ -struct file *fget_light(unsigned int fd, int *fput_needed) +static unsigned long __fget_light(unsigned int fd, fmode_t mask) { - struct file *file; struct files_struct *files = current->files; + struct file *file; - *fput_needed = 0; if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); - if (file && (file->f_mode & FMODE_PATH)) - file = NULL; + file = __fcheck_files(files, fd); + if (!file || unlikely(file->f_mode & mask)) + return 0; + return (unsigned long)file; } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (!(file->f_mode & FMODE_PATH) && - atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); + file = __fget(fd, mask); + if (!file) + return 0; + return FDPUT_FPUT | (unsigned long)file; } +} +unsigned long __fdget(unsigned int fd) +{ + return __fget_light(fd, FMODE_PATH); +} +EXPORT_SYMBOL(__fdget); - return file; +unsigned long __fdget_raw(unsigned int fd) +{ + return __fget_light(fd, 0); } -EXPORT_SYMBOL(fget_light); -struct file *fget_raw_light(unsigned int fd, int *fput_needed) +unsigned long __fdget_pos(unsigned int fd) { - struct file *file; - struct files_struct *files = current->files; + unsigned long v = __fdget(fd); + struct file *file = (struct file *)(v & ~3); - *fput_needed = 0; - if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); - } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; + if (file && (file->f_mode & FMODE_ATOMIC_POS)) { + if (file_count(file) > 1) { + v |= FDPUT_POS_UNLOCK; + mutex_lock(&file->f_pos_lock); } - rcu_read_unlock(); } - - return file; + return v; } +/* + * We only lock f_pos if we have threads or if the file might be + * shared with another process. In both cases we'll have an elevated + * file count (done either by fdget() or by fork()). + */ + void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; diff --git a/fs/file_table.c b/fs/file_table.c index abdd15ad13c..385bfd31512 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -36,8 +36,6 @@ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -DEFINE_STATIC_LGLOCK(files_lglock); - /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __read_mostly; @@ -54,7 +52,6 @@ static void file_free_rcu(struct rcu_head *head) static inline void file_free(struct file *f) { percpu_counter_dec(&nr_files); - file_check_state(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -79,14 +76,14 @@ EXPORT_SYMBOL_GPL(get_max_files); * Handle nr_files sysctl */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -int proc_nr_files(ctl_table *table, int write, +int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else -int proc_nr_files(ctl_table *table, int write, +int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; @@ -134,10 +131,10 @@ struct file *get_empty_filp(void) return ERR_PTR(error); } - INIT_LIST_HEAD(&f->f_u.fu_list); atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); + mutex_init(&f->f_pos_lock); eventpoll_init_file(f); /* f->f_version: 0 */ return f; @@ -178,49 +175,20 @@ struct file *alloc_file(struct path *path, fmode_t mode, file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; + if ((mode & FMODE_READ) && + likely(fop->read || fop->aio_read || fop->read_iter)) + mode |= FMODE_CAN_READ; + if ((mode & FMODE_WRITE) && + likely(fop->write || fop->aio_write || fop->write_iter)) + mode |= FMODE_CAN_WRITE; file->f_mode = mode; file->f_op = fop; - - /* - * These mounts don't really matter in practice - * for r/o bind mounts. They aren't userspace- - * visible. We do this for consistency, and so - * that we can do debugging checks at __fput() - */ - if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { - file_take_write(file); - WARN_ON(mnt_clone_write(path->mnt)); - } if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); return file; } EXPORT_SYMBOL(alloc_file); -/** - * drop_file_write_access - give up ability to write to a file - * @file: the file to which we will stop writing - * - * This is a central place which will give up the ability - * to write to @file, along with access to write through - * its vfsmount. - */ -static void drop_file_write_access(struct file *file) -{ - struct vfsmount *mnt = file->f_path.mnt; - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - - put_write_access(inode); - - if (special_file(inode->i_mode)) - return; - if (file_check_writeable(file) != 0) - return; - __mnt_drop_write(mnt); - file_release_write(file); -} - /* the real guts of fput() - releasing the last reference to file */ static void __fput(struct file *file) @@ -237,14 +205,14 @@ static void __fput(struct file *file) * in the file cleanup chain. */ eventpoll_release(file); - locks_remove_flock(file); + locks_remove_file(file); if (unlikely(file->f_flags & FASYNC)) { - if (file->f_op && file->f_op->fasync) + if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); } ima_file_free(file); - if (file->f_op && file->f_op->release) + if (file->f_op->release) file->f_op->release(inode, file); security_file_free(file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && @@ -255,8 +223,10 @@ static void __fput(struct file *file) put_pid(file->f_owner.pid); if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); - if (file->f_mode & FMODE_WRITE) - drop_file_write_access(file); + if (file->f_mode & FMODE_WRITER) { + put_write_access(inode); + __mnt_drop_write(mnt); + } file->f_path.dentry = NULL; file->f_path.mnt = NULL; file->f_inode = NULL; @@ -297,14 +267,13 @@ void flush_delayed_fput(void) delayed_fput(NULL); } -static DECLARE_WORK(delayed_fput_work, delayed_fput); +static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); void fput(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; - file_sb_list_del(file); if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_u.fu_rcuhead, ____fput); if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) @@ -317,7 +286,7 @@ void fput(struct file *file) } if (llist_add(&file->f_u.fu_llist, &delayed_fput_list)) - schedule_work(&delayed_fput_work); + schedule_delayed_work(&delayed_fput_work, 1); } } @@ -333,7 +302,6 @@ void __fput_sync(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; - file_sb_list_del(file); BUG_ON(!(task->flags & PF_KTHREAD)); __fput(file); } @@ -345,129 +313,10 @@ void put_filp(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { security_file_free(file); - file_sb_list_del(file); file_free(file); } } -static inline int file_list_cpu(struct file *file) -{ -#ifdef CONFIG_SMP - return file->f_sb_list_cpu; -#else - return smp_processor_id(); -#endif -} - -/* helper for file_sb_list_add to reduce ifdefs */ -static inline void __file_sb_list_add(struct file *file, struct super_block *sb) -{ - struct list_head *list; -#ifdef CONFIG_SMP - int cpu; - cpu = smp_processor_id(); - file->f_sb_list_cpu = cpu; - list = per_cpu_ptr(sb->s_files, cpu); -#else - list = &sb->s_files; -#endif - list_add(&file->f_u.fu_list, list); -} - -/** - * file_sb_list_add - add a file to the sb's file list - * @file: file to add - * @sb: sb to add it to - * - * Use this function to associate a file with the superblock of the inode it - * refers to. - */ -void file_sb_list_add(struct file *file, struct super_block *sb) -{ - if (likely(!(file->f_mode & FMODE_WRITE))) - return; - if (!S_ISREG(file_inode(file)->i_mode)) - return; - lg_local_lock(&files_lglock); - __file_sb_list_add(file, sb); - lg_local_unlock(&files_lglock); -} - -/** - * file_sb_list_del - remove a file from the sb's file list - * @file: file to remove - * @sb: sb to remove it from - * - * Use this function to remove a file from its superblock. - */ -void file_sb_list_del(struct file *file) -{ - if (!list_empty(&file->f_u.fu_list)) { - lg_local_lock_cpu(&files_lglock, file_list_cpu(file)); - list_del_init(&file->f_u.fu_list); - lg_local_unlock_cpu(&files_lglock, file_list_cpu(file)); - } -} - -#ifdef CONFIG_SMP - -/* - * These macros iterate all files on all CPUs for a given superblock. - * files_lglock must be held globally. - */ -#define do_file_list_for_each_entry(__sb, __file) \ -{ \ - int i; \ - for_each_possible_cpu(i) { \ - struct list_head *list; \ - list = per_cpu_ptr((__sb)->s_files, i); \ - list_for_each_entry((__file), list, f_u.fu_list) - -#define while_file_list_for_each_entry \ - } \ -} - -#else - -#define do_file_list_for_each_entry(__sb, __file) \ -{ \ - struct list_head *list; \ - list = &(sb)->s_files; \ - list_for_each_entry((__file), list, f_u.fu_list) - -#define while_file_list_for_each_entry \ -} - -#endif - -/** - * mark_files_ro - mark all files read-only - * @sb: superblock in question - * - * All files are marked read-only. We don't care about pending - * delete files so this should be used in 'force' mode only. - */ -void mark_files_ro(struct super_block *sb) -{ - struct file *f; - - lg_global_lock(&files_lglock); - do_file_list_for_each_entry(sb, f) { - if (!file_count(f)) - continue; - if (!(f->f_mode & FMODE_WRITE)) - continue; - spin_lock(&f->f_lock); - f->f_mode &= ~FMODE_WRITE; - spin_unlock(&f->f_lock); - if (file_check_writeable(f) != 0) - continue; - __mnt_drop_write(f->f_path.mnt); - file_release_write(f); - } while_file_list_for_each_entry; - lg_global_unlock(&files_lglock); -} - void __init files_init(unsigned long mempages) { unsigned long n; @@ -482,7 +331,5 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - files_defer_init(); - lg_lock_init(&files_lglock, "files_lglock"); percpu_counter_init(&nr_files, 0); } diff --git a/fs/filesystems.c b/fs/filesystems.c index 92567d95ba6..5797d45a78c 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -121,6 +121,7 @@ int unregister_filesystem(struct file_system_type * fs) EXPORT_SYMBOL(unregister_filesystem); +#ifdef CONFIG_SYSFS_SYSCALL static int fs_index(const char __user * __name) { struct file_system_type * tmp; @@ -199,6 +200,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) } return retval; } +#endif int __init get_filesystem_list(char *buf) { diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index f47df72cef1..363e3ae25f6 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -354,7 +354,7 @@ static void vxfs_i_callback(struct rcu_head *head) void vxfs_evict_inode(struct inode *ip) { - truncate_inode_pages(&ip->i_data, 0); + truncate_inode_pages_final(&ip->i_data); clear_inode(ip); call_rcu(&ip->i_rcu, vxfs_i_callback); } diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 25d4099a4ae..99c7f0a37af 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -192,7 +192,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp) * vxfs_lookup - lookup pathname component * @dip: dir in which we lookup * @dp: dentry we lookup - * @nd: lookup nameidata + * @flags: lookup flags * * Description: * vxfs_lookup tries to lookup the pathname component described diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index e37eb274e49..7ca8c75d50d 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -124,6 +124,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) static int vxfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9f4935b8f20..be568b7311d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -26,6 +26,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/tracepoint.h> +#include <linux/device.h> #include "internal.h" /* @@ -88,16 +89,31 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include <trace/events/writeback.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); + +static void bdi_wakeup_thread(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi->wb_lock); + if (test_bit(BDI_registered, &bdi->state)) + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + spin_unlock_bh(&bdi->wb_lock); +} + static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { trace_writeback_queue(bdi, work); spin_lock_bh(&bdi->wb_lock); + if (!test_bit(BDI_registered, &bdi->state)) { + if (work->done) + complete(work->done); + goto out_unlock; + } list_add_tail(&work->list, &bdi->work_list); - spin_unlock_bh(&bdi->wb_lock); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); +out_unlock: + spin_unlock_bh(&bdi->wb_lock); } static void @@ -113,7 +129,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { trace_writeback_nowork(bdi); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + bdi_wakeup_thread(bdi); return; } @@ -160,7 +176,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + bdi_wakeup_thread(bdi); } /* @@ -510,13 +526,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, } WARN_ON(inode->i_state & I_SYNC); /* - * Skip inode if it is clean. We don't want to mess with writeback - * lists in this function since flusher thread may be doing for example - * sync in parallel and if we move the inode, it could get skipped. So - * here we make sure inode is on some writeback list and leave it there - * unless we have completely cleaned the inode. + * Skip inode if it is clean and we have no outstanding writeback in + * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this + * function since flusher thread may be doing for example sync in + * parallel and if we move the inode, it could get skipped. So here we + * make sure inode is on some writeback list and leave it there unless + * we have completely cleaned the inode. */ - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY) && + (wbc->sync_mode != WB_SYNC_ALL || + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; spin_unlock(&inode->i_lock); @@ -1013,7 +1032,7 @@ void bdi_writeback_workfn(struct work_struct *work) current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || - list_empty(&bdi->bdi_list))) { + !test_bit(BDI_registered, &bdi->state))) { /* * The normal path. Keep writing back @bdi until its * work_list is empty. Note that this path is also taken @@ -1035,10 +1054,10 @@ void bdi_writeback_workfn(struct work_struct *work) trace_writeback_pages_written(pages_written); } - if (!list_empty(&bdi->work_list) || - (wb_has_dirty_io(wb) && dirty_writeback_interval)) - queue_delayed_work(bdi_wq, &wb->dwork, - msecs_to_jiffies(dirty_writeback_interval * 10)); + if (!list_empty(&bdi->work_list)) + mod_delayed_work(bdi_wq, &wb->dwork, 0); + else if (wb_has_dirty_io(wb) && dirty_writeback_interval) + bdi_wakeup_thread_delayed(bdi); current->flags &= ~PF_SWAPWRITE; } diff --git a/fs/fs_struct.c b/fs/fs_struct.c index d8ac61d0c93..7dca743b2ce 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -161,6 +161,6 @@ EXPORT_SYMBOL(current_umask); struct fs_struct init_fs = { .users = 1, .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), - .seq = SEQCNT_ZERO, + .seq = SEQCNT_ZERO(init_fs.seq), .umask = 0022, }; diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index f7cff367db7..56cce7fdd39 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -280,15 +280,15 @@ int fscache_add_cache(struct fscache_cache *cache, spin_unlock(&fscache_fsdef_index.lock); up_write(&fscache_addremove_sem); - printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n", - cache->tag->name, cache->ops->name); + pr_notice("Cache \"%s\" added (type %s)\n", + cache->tag->name, cache->ops->name); kobject_uevent(cache->kobj, KOBJ_ADD); _leave(" = 0 [%s]", cache->identifier); return 0; tag_in_use: - printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname); + pr_err("Cache tag '%s' already in use\n", tagname); __fscache_release_cache_tag(tag); _leave(" = -EXIST"); return -EEXIST; @@ -317,8 +317,7 @@ EXPORT_SYMBOL(fscache_add_cache); void fscache_io_error(struct fscache_cache *cache) { if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) - printk(KERN_ERR "FS-Cache:" - " Cache '%s' stopped due to I/O error\n", + pr_err("Cache '%s' stopped due to I/O error\n", cache->ops->name); } EXPORT_SYMBOL(fscache_io_error); @@ -369,8 +368,8 @@ void fscache_withdraw_cache(struct fscache_cache *cache) _enter(""); - printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n", - cache->tag->name); + pr_notice("Withdrawing cache \"%s\"\n", + cache->tag->name); /* make the cache unavailable for cookie acquisition */ if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index b2a86e324aa..aec01be91b0 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -58,15 +58,16 @@ void fscache_cookie_init_once(void *_cookie) struct fscache_cookie *__fscache_acquire_cookie( struct fscache_cookie *parent, const struct fscache_cookie_def *def, - void *netfs_data) + void *netfs_data, + bool enable) { struct fscache_cookie *cookie; BUG_ON(!def); - _enter("{%s},{%s},%p", + _enter("{%s},{%s},%p,%u", parent ? (char *) parent->def->name : "<no-parent>", - def->name, netfs_data); + def->name, netfs_data, enable); fscache_stat(&fscache_n_acquires); @@ -106,7 +107,7 @@ struct fscache_cookie *__fscache_acquire_cookie( cookie->def = def; cookie->parent = parent; cookie->netfs_data = netfs_data; - cookie->flags = 0; + cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ @@ -124,16 +125,22 @@ struct fscache_cookie *__fscache_acquire_cookie( break; } - /* if the object is an index then we need do nothing more here - we - * create indices on disk when we need them as an index may exist in - * multiple caches */ - if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (fscache_acquire_non_index_cookie(cookie) < 0) { - atomic_dec(&parent->n_children); - __fscache_cookie_put(cookie); - fscache_stat(&fscache_n_acquires_nobufs); - _leave(" = NULL"); - return NULL; + if (enable) { + /* if the object is an index then we need do nothing more here + * - we create indices on disk when we need them as an index + * may exist in multiple caches */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie) == 0) { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } else { + atomic_dec(&parent->n_children); + __fscache_cookie_put(cookie); + fscache_stat(&fscache_n_acquires_nobufs); + _leave(" = NULL"); + return NULL; + } + } else { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } } @@ -144,6 +151,39 @@ struct fscache_cookie *__fscache_acquire_cookie( EXPORT_SYMBOL(__fscache_acquire_cookie); /* + * Enable a cookie to permit it to accept new operations. + */ +void __fscache_enable_cookie(struct fscache_cookie *cookie, + bool (*can_enable)(void *data), + void *data) +{ + _enter("%p", cookie); + + wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + + if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) + goto out_unlock; + + if (can_enable && !can_enable(data)) { + /* The netfs decided it didn't want to enable after all */ + } else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + /* Wait for outstanding disablement to complete */ + __fscache_wait_on_invalidate(cookie); + + if (fscache_acquire_non_index_cookie(cookie) == 0) + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } else { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } + +out_unlock: + clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); +} +EXPORT_SYMBOL(__fscache_enable_cookie); + +/* * acquire a non-index cookie * - this must make sure the index chain is instantiated and instantiate the * object representation too @@ -157,7 +197,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) _enter(""); - cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE; + set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); /* now we need to see whether the backing objects for this cookie yet * exist, if not there'll be nothing to search */ @@ -180,9 +220,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) _debug("cache %s", cache->tag->name); - cookie->flags = - (1 << FSCACHE_COOKIE_LOOKING_UP) | - (1 << FSCACHE_COOKIE_NO_DATA_YET); + set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); /* ask the cache to allocate objects for this cookie and its parent * chain */ @@ -398,7 +436,8 @@ void __fscache_invalidate(struct fscache_cookie *cookie) if (!hlist_empty(&cookie->backing_objects)) { spin_lock(&cookie->lock); - if (!hlist_empty(&cookie->backing_objects) && + if (fscache_cookie_enabled(cookie) && + !hlist_empty(&cookie->backing_objects) && !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { object = hlist_entry(cookie->backing_objects.first, @@ -452,10 +491,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie) spin_lock(&cookie->lock); - /* update the index entry on disk in each cache backing this cookie */ - hlist_for_each_entry(object, - &cookie->backing_objects, cookie_link) { - fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + if (fscache_cookie_enabled(cookie)) { + /* update the index entry on disk in each cache backing this + * cookie. + */ + hlist_for_each_entry(object, + &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + } } spin_unlock(&cookie->lock); @@ -464,15 +507,80 @@ void __fscache_update_cookie(struct fscache_cookie *cookie) EXPORT_SYMBOL(__fscache_update_cookie); /* + * Disable a cookie to stop it from accepting new requests from the netfs. + */ +void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) +{ + struct fscache_object *object; + bool awaken = false; + + _enter("%p,%u", cookie, invalidate); + + ASSERTCMP(atomic_read(&cookie->n_active), >, 0); + + if (atomic_read(&cookie->n_children) != 0) { + pr_err("Cookie '%s' still has children\n", + cookie->def->name); + BUG(); + } + + wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) + goto out_unlock_enable; + + /* If the cookie is being invalidated, wait for that to complete first + * so that we can reuse the flag. + */ + __fscache_wait_on_invalidate(cookie); + + /* Dispose of the backing objects */ + set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags); + + spin_lock(&cookie->lock); + if (!hlist_empty(&cookie->backing_objects)) { + hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { + if (invalidate) + set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); + fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); + } + } else { + if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + awaken = true; + } + spin_unlock(&cookie->lock); + if (awaken) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); + + /* Wait for cessation of activity requiring access to the netfs (when + * n_active reaches 0). This makes sure outstanding reads and writes + * have completed. + */ + if (!atomic_dec_and_test(&cookie->n_active)) + wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, + TASK_UNINTERRUPTIBLE); + + /* Reset the cookie state if it wasn't relinquished */ + if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) { + atomic_inc(&cookie->n_active); + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + } + +out_unlock_enable: + clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); + _leave(""); +} +EXPORT_SYMBOL(__fscache_disable_cookie); + +/* * release a cookie back to the cache * - the object will be marked as recyclable on disk if retire is true * - all dependents of this cookie must have already been unregistered * (indices/files/pages) */ -void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) { - struct fscache_object *object; - fscache_stat(&fscache_n_relinquishes); if (retire) fscache_stat(&fscache_n_relinquishes_retire); @@ -487,31 +595,10 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) cookie, cookie->def->name, cookie->netfs_data, atomic_read(&cookie->n_active), retire); - ASSERTCMP(atomic_read(&cookie->n_active), >, 0); - - if (atomic_read(&cookie->n_children) != 0) { - printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", - cookie->def->name); - BUG(); - } - /* No further netfs-accessing operations on this cookie permitted */ set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); - if (retire) - set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); - spin_lock(&cookie->lock); - hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { - fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); - } - spin_unlock(&cookie->lock); - - /* Wait for cessation of activity requiring access to the netfs (when - * n_active reaches 0). - */ - if (!atomic_dec_and_test(&cookie->n_active)) - wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, - TASK_UNINTERRUPTIBLE); + __fscache_disable_cookie(cookie, retire); /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; @@ -568,6 +655,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) { struct fscache_operation *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,", cookie); @@ -591,7 +679,8 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto inconsistent; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); @@ -600,7 +689,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) op->debug_id = atomic_inc_return(&fscache_op_debug_id); - atomic_inc(&cookie->n_active); + __fscache_use_cookie(cookie); if (fscache_submit_op(object, op) < 0) goto submit_failed; @@ -622,9 +711,11 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) return ret; submit_failed: - atomic_dec(&cookie->n_active); + wake_cookie = __fscache_unuse_cookie(cookie); inconsistent: spin_unlock(&cookie->lock); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); kfree(op); _leave(" = -ESTALE"); return -ESTALE; diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c index 10a2ade0bdf..5a117df2a9e 100644 --- a/fs/fscache/fsdef.c +++ b/fs/fscache/fsdef.c @@ -59,6 +59,7 @@ struct fscache_cookie fscache_fsdef_index = { .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), .backing_objects = HLIST_HEAD_INIT, .def = &fscache_fsdef_index_def, + .flags = 1 << FSCACHE_COOKIE_ENABLED, }; EXPORT_SYMBOL(fscache_fsdef_index); diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c index bad496748a5..7d637e2335f 100644 --- a/fs/fscache/histogram.c +++ b/fs/fscache/histogram.c @@ -31,12 +31,10 @@ static int fscache_histogram_show(struct seq_file *m, void *v) switch ((unsigned long) v) { case 1: - seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS " - " RETRV DLY RETRIEVLS\n"); + seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS\n"); return 0; case 2: - seq_puts(m, "===== ===== ========= ========= =========" - " ========= =========\n"); + seq_puts(m, "===== ===== ========= ========= ========= ========= =========\n"); return 0; default: index = (unsigned long) v - 3; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 4226f6680b0..bc6c08fcfdd 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -22,6 +22,12 @@ * */ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "FS-Cache: " fmt + #include <linux/fscache-cache.h> #include <linux/sched.h> @@ -413,8 +419,8 @@ do { \ #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -422,9 +428,9 @@ do { \ #define ASSERTCMP(X, OP, Y) \ do { \ if (unlikely(!((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ @@ -433,8 +439,8 @@ do { \ #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -442,9 +448,9 @@ do { \ #define ASSERTIFCMP(C, X, OP, Y) \ do { \ if (unlikely((C) && !((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 7c27907e650..63f868e869b 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write, return ret; } -ctl_table fscache_sysctls[] = { +struct ctl_table fscache_sysctls[] = { { .procname = "object_max_active", .data = &fscache_object_max_active, @@ -87,7 +87,7 @@ ctl_table fscache_sysctls[] = { {} }; -ctl_table fscache_sysctls_root[] = { +struct ctl_table fscache_sysctls_root[] = { { .procname = "fscache", .mode = 0555, @@ -146,8 +146,7 @@ static int __init fscache_init(void) 0, fscache_cookie_init_once); if (!fscache_cookie_jar) { - printk(KERN_NOTICE - "FS-Cache: Failed to allocate a cookie jar\n"); + pr_notice("Failed to allocate a cookie jar\n"); ret = -ENOMEM; goto error_cookie_jar; } @@ -156,7 +155,7 @@ static int __init fscache_init(void) if (!fscache_root) goto error_kobj; - printk(KERN_NOTICE "FS-Cache: Loaded\n"); + pr_notice("Loaded\n"); return 0; error_kobj: @@ -192,7 +191,7 @@ static void __exit fscache_exit(void) fscache_proc_cleanup(); destroy_workqueue(fscache_op_wq); destroy_workqueue(fscache_object_wq); - printk(KERN_NOTICE "FS-Cache: Unloaded\n"); + pr_notice("Unloaded\n"); } module_exit(fscache_exit); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index b1bb6117473..6d941f56faf 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -45,6 +45,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) netfs->primary_index->def = &fscache_fsdef_netfs_def; netfs->primary_index->parent = &fscache_fsdef_index; netfs->primary_index->netfs_data = netfs; + netfs->primary_index->flags = 1 << FSCACHE_COOKIE_ENABLED; atomic_inc(&netfs->primary_index->parent->usage); atomic_inc(&netfs->primary_index->parent->n_children); @@ -64,8 +65,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) list_add(&netfs->link, &fscache_netfs_list); ret = 0; - printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n", - netfs->name); + pr_notice("Netfs '%s' registered for caching\n", netfs->name); already_registered: up_write(&fscache_addremove_sem); @@ -96,8 +96,8 @@ void __fscache_unregister_netfs(struct fscache_netfs *netfs) up_write(&fscache_addremove_sem); - printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n", - netfs->name); + pr_notice("Netfs '%s' unregistered from caching\n", + netfs->name); _leave(""); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index e1959efad64..b8179ca6bf9 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -50,6 +50,8 @@ void fscache_objlist_add(struct fscache_object *obj) struct fscache_object *xobj; struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL; + ASSERT(RB_EMPTY_NODE(&obj->objlist_link)); + write_lock(&fscache_object_list_lock); while (*p) { @@ -75,6 +77,9 @@ void fscache_objlist_add(struct fscache_object *obj) */ void fscache_objlist_remove(struct fscache_object *obj) { + if (RB_EMPTY_NODE(&obj->objlist_link)) + return; + write_lock(&fscache_object_list_lock); BUG_ON(RB_EMPTY_ROOT(&fscache_object_list)); @@ -280,20 +285,20 @@ static int fscache_objlist_show(struct seq_file *m, void *v) fscache_unuse_cookie(obj); if (keylen > 0 || auxlen > 0) { - seq_printf(m, " "); + seq_puts(m, " "); for (p = buf; keylen > 0; keylen--) seq_printf(m, "%02x", *p++); if (auxlen > 0) { if (config & FSCACHE_OBJLIST_CONFIG_KEY) - seq_printf(m, ", "); + seq_puts(m, ", "); for (; auxlen > 0; auxlen--) seq_printf(m, "%02x", *p++); } } - seq_printf(m, "\n"); + seq_puts(m, "\n"); } else { - seq_printf(m, "<no_netfs>\n"); + seq_puts(m, "<no_netfs>\n"); } return 0; } diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 86d75a60b20..d3b4539f165 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -314,6 +314,9 @@ void fscache_object_init(struct fscache_object *object, object->cache = cache; object->cookie = cookie; object->parent = NULL; +#ifdef CONFIG_FSCACHE_OBJECT_LIST + RB_CLEAR_NODE(&object->objlist_link); +#endif object->oob_event_mask = 0; for (t = object->oob_table; t->events; t++) @@ -495,6 +498,7 @@ void fscache_object_lookup_negative(struct fscache_object *object) * returning ENODATA. */ set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); _debug("wake up lookup %p", &cookie->flags); clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); @@ -527,6 +531,7 @@ void fscache_obtained_object(struct fscache_object *object) /* We do (presumably) have data */ clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); /* Allow write requests to begin stacking up and read requests * to begin shovelling data. @@ -679,7 +684,8 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob */ spin_lock(&cookie->lock); hlist_del_init(&object->cookie_link); - if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + if (hlist_empty(&cookie->backing_objects) && + test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) awaken = true; spin_unlock(&cookie->lock); @@ -796,7 +802,7 @@ void fscache_enqueue_object(struct fscache_object *object) */ bool fscache_object_sleep_till_congested(signed long *timeoutp) { - wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait); + wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); DEFINE_WAIT(wait); if (fscache_object_congested()) @@ -927,7 +933,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj */ if (!fscache_use_cookie(object)) { ASSERT(object->cookie->stores.rnode == NULL); - set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); + set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); _leave(" [no cookie]"); return transit_to(KILL_OBJECT); } diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 318071aca21..e7b87a0e518 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -51,8 +51,7 @@ void fscache_enqueue_operation(struct fscache_operation *op) _debug("queue for caller's attention"); break; default: - printk(KERN_ERR "FS-Cache: Unexpected op type %lx", - op->flags); + pr_err("Unexpected op type %lx", op->flags); BUG(); break; } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 73899c1c344..ed70714503f 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -163,12 +163,10 @@ static void fscache_attr_changed_op(struct fscache_operation *op) fscache_stat(&fscache_n_attr_changed_calls); - if (fscache_object_is_active(object) && - fscache_use_cookie(object)) { + if (fscache_object_is_active(object)) { fscache_stat(&fscache_n_cop_attr_changed); ret = object->cache->ops->attr_changed(object); fscache_stat_d(&fscache_n_cop_attr_changed); - fscache_unuse_cookie(object); if (ret < 0) fscache_abort_object(object); } @@ -184,6 +182,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) { struct fscache_operation *op; struct fscache_object *object; + bool wake_cookie; _enter("%p", cookie); @@ -199,15 +198,19 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) } fscache_operation_init(op, fscache_attr_changed_op, NULL); - op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); + op->flags = FSCACHE_OP_ASYNC | + (1 << FSCACHE_OP_EXCLUSIVE) | + (1 << FSCACHE_OP_UNUSE_COOKIE); spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); + __fscache_use_cookie(cookie); if (fscache_submit_exclusive_op(object, op) < 0) goto nobufs; spin_unlock(&cookie->lock); @@ -217,8 +220,11 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return 0; nobufs: + wake_cookie = __fscache_unuse_cookie(cookie); spin_unlock(&cookie->lock); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_attr_changed_nobufs); _leave(" = %d", -ENOBUFS); return -ENOBUFS; @@ -263,7 +269,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval( } fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); - atomic_inc(&cookie->n_active); op->op.flags = FSCACHE_OP_MYTHREAD | (1UL << FSCACHE_OP_WAITING) | (1UL << FSCACHE_OP_UNUSE_COOKIE); @@ -384,6 +389,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, { struct fscache_retrieval *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,%p,,,", cookie, page); @@ -405,7 +411,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, return -ERESTARTSYS; op = fscache_alloc_retrieval(cookie, page->mapping, - end_io_func,context); + end_io_func, context); if (!op) { _leave(" = -ENOMEM"); return -ENOMEM; @@ -414,13 +420,15 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs_unlock; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)); + __fscache_use_cookie(cookie); atomic_inc(&object->n_reads); __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); @@ -475,9 +483,11 @@ error: nobufs_unlock_dec: atomic_dec(&object->n_reads); + wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - atomic_dec(&cookie->n_active); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); kfree(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); @@ -514,6 +524,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, { struct fscache_retrieval *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,,%d,,,", cookie, *nr_pages); @@ -542,11 +553,13 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs_unlock; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); + __fscache_use_cookie(cookie); atomic_inc(&object->n_reads); __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); @@ -601,10 +614,12 @@ error: nobufs_unlock_dec: atomic_dec(&object->n_reads); + wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - atomic_dec(&cookie->n_active); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); _leave(" = -ENOBUFS"); @@ -626,6 +641,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, { struct fscache_retrieval *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,%p,,,", cookie, page); @@ -653,13 +669,15 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs_unlock; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); + __fscache_use_cookie(cookie); if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock; + goto nobufs_unlock_dec; spin_unlock(&cookie->lock); fscache_stat(&fscache_n_alloc_ops); @@ -689,10 +707,13 @@ error: _leave(" = %d", ret); return ret; +nobufs_unlock_dec: + wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - atomic_dec(&cookie->n_active); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); nobufs: fscache_stat(&fscache_n_allocs_nobufs); _leave(" = -ENOBUFS"); @@ -889,6 +910,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, { struct fscache_storage *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,%x,", cookie, (u32) page->flags); @@ -920,7 +942,8 @@ int __fscache_write_page(struct fscache_cookie *cookie, ret = -ENOBUFS; spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); @@ -957,7 +980,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); op->store_limit = object->store_limit; - atomic_inc(&cookie->n_active); + __fscache_use_cookie(cookie); if (fscache_submit_op(object, &op->op) < 0) goto submit_failed; @@ -984,10 +1007,10 @@ already_pending: return 0; submit_failed: - atomic_dec(&cookie->n_active); spin_lock(&cookie->stores_lock); radix_tree_delete(&cookie->stores, page->index); spin_unlock(&cookie->stores_lock); + wake_cookie = __fscache_unuse_cookie(cookie); page_cache_release(page); ret = -ENOBUFS; goto nobufs; @@ -999,6 +1022,8 @@ nobufs: spin_unlock(&cookie->lock); radix_tree_preload_end(); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_stores_nobufs); _leave(" = -ENOBUFS"); return -ENOBUFS; @@ -1083,10 +1108,8 @@ void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page) static bool once_only; if (!once_only) { once_only = true; - printk(KERN_WARNING "FS-Cache:" - " Cookie type %s marked page %lx" - " multiple times\n", - cookie->def->name, page->index); + pr_warn("Cookie type %s marked page %lx multiple times\n", + cookie->def->name, page->index); } } diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a0b0855d00a..205e0d5d530 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -348,7 +348,7 @@ int __init fuse_ctl_init(void) return register_filesystem(&fuse_ctl_fs_type); } -void fuse_ctl_cleanup(void) +void __exit fuse_ctl_cleanup(void) { unregister_filesystem(&fuse_ctl_fs_type); } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index adbfd66b380..966ace8b243 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -94,8 +94,10 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, loff_t pos = 0; struct iovec iov = { .iov_base = buf, .iov_len = count }; struct fuse_io_priv io = { .async = 0, .file = file }; + struct iov_iter ii; + iov_iter_init(&ii, READ, &iov, 1, count); - return fuse_direct_io(&io, &iov, 1, count, &pos, 0); + return fuse_direct_io(&io, &ii, &pos, FUSE_DIO_CUSE); } static ssize_t cuse_write(struct file *file, const char __user *buf, @@ -104,12 +106,15 @@ static ssize_t cuse_write(struct file *file, const char __user *buf, loff_t pos = 0; struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; struct fuse_io_priv io = { .async = 0, .file = file }; + struct iov_iter ii; + iov_iter_init(&ii, WRITE, &iov, 1, count); /* * No locking or generic_write_checks(), the server is * responsible for locking and sanity checks. */ - return fuse_direct_io(&io, &iov, 1, count, &pos, 1); + return fuse_direct_io(&io, &ii, &pos, + FUSE_DIO_WRITE | FUSE_DIO_CUSE); } static int cuse_open(struct inode *inode, struct file *file) @@ -473,7 +478,7 @@ err: static void cuse_fc_release(struct fuse_conn *fc) { struct cuse_conn *cc = fc_to_cc(fc); - kfree(cc); + kfree_rcu(cc, fc.rcu); } /** @@ -568,7 +573,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev, return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); } -static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL); +static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL); static ssize_t cuse_class_abort_store(struct device *dev, struct device_attribute *attr, @@ -579,7 +584,7 @@ static ssize_t cuse_class_abort_store(struct device *dev, fuse_abort_conn(&cc->fc); return count; } -static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store); +static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); static struct attribute *cuse_class_dev_attrs[] = { &dev_attr_waiting.attr, @@ -589,11 +594,14 @@ static struct attribute *cuse_class_dev_attrs[] = { ATTRIBUTE_GROUPS(cuse_class_dev); static struct miscdevice cuse_miscdev = { - .minor = MISC_DYNAMIC_MINOR, + .minor = CUSE_MINOR, .name = "cuse", .fops = &cuse_channel_fops, }; +MODULE_ALIAS_MISCDEV(CUSE_MINOR); +MODULE_ALIAS("devname:cuse"); + static int __init cuse_init(void) { int i, rc; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ef74ad5fd36..ca887314aba 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -643,9 +643,8 @@ struct fuse_copy_state { unsigned long seglen; unsigned long addr; struct page *pg; - void *mapaddr; - void *buf; unsigned len; + unsigned offset; unsigned move_pages:1; }; @@ -666,23 +665,17 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; - if (!cs->write) { - buf->ops->unmap(cs->pipe, buf, cs->mapaddr); - } else { - kunmap(buf->page); + if (cs->write) buf->len = PAGE_SIZE - cs->len; - } cs->currbuf = NULL; - cs->mapaddr = NULL; - } else if (cs->mapaddr) { - kunmap(cs->pg); + } else if (cs->pg) { if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); } put_page(cs->pg); - cs->mapaddr = NULL; } + cs->pg = NULL; } /* @@ -691,7 +684,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) */ static int fuse_copy_fill(struct fuse_copy_state *cs) { - unsigned long offset; + struct page *page; int err; unlock_request(cs->fc, cs->req); @@ -706,14 +699,12 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); + cs->pg = buf->page; + cs->offset = buf->offset; cs->len = buf->len; - cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; cs->nr_segs--; } else { - struct page *page; - if (cs->nr_segs == cs->pipe->buffers) return -EIO; @@ -726,8 +717,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap(page); - cs->buf = cs->mapaddr; + cs->pg = page; + cs->offset = 0; cs->len = PAGE_SIZE; cs->pipebufs++; cs->nr_segs++; @@ -740,14 +731,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->iov++; cs->nr_segs--; } - err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + err = get_user_pages_fast(cs->addr, 1, cs->write, &page); if (err < 0) return err; BUG_ON(err != 1); - offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap(cs->pg); - cs->buf = cs->mapaddr + offset; - cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->pg = page; + cs->offset = cs->addr % PAGE_SIZE; + cs->len = min(PAGE_SIZE - cs->offset, cs->seglen); cs->seglen -= cs->len; cs->addr += cs->len; } @@ -760,15 +750,20 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) { unsigned ncpy = min(*size, cs->len); if (val) { + void *pgaddr = kmap_atomic(cs->pg); + void *buf = pgaddr + cs->offset; + if (cs->write) - memcpy(cs->buf, *val, ncpy); + memcpy(buf, *val, ncpy); else - memcpy(*val, cs->buf, ncpy); + memcpy(*val, buf, ncpy); + + kunmap_atomic(pgaddr); *val += ncpy; } *size -= ncpy; cs->len -= ncpy; - cs->buf += ncpy; + cs->offset += ncpy; return ncpy; } @@ -874,8 +869,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) out_fallback_unlock: unlock_page(newpage); out_fallback: - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); - cs->buf = cs->mapaddr + buf->offset; + cs->pg = buf->page; + cs->offset = buf->offset; err = lock_request(cs->fc, cs->req); if (err) @@ -1296,22 +1291,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); } -static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - -static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = generic_pipe_buf_release, - .steal = fuse_dev_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -1358,7 +1337,11 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, buf->page = bufs[page_nr].page; buf->offset = bufs[page_nr].offset; buf->len = bufs[page_nr].len; - buf->ops = &fuse_dev_pipe_buf_ops; + /* + * Need to be careful about this. Having buf->ops in module + * code can Oops if the buffer persists after module unload. + */ + buf->ops = &nosteal_pipe_buf_ops; pipe->nrbufs++; page_nr++; @@ -1599,7 +1582,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!err && offset == 0 && (num != 0 || file_size == end)) + if (!err && offset == 0 && + (this_num == PAGE_CACHE_SIZE || file_size == end)) SetPageUptodate(page); unlock_page(page); page_cache_release(page); @@ -1625,7 +1609,7 @@ out_finish: static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) { - release_pages(req->pages, req->num_pages, 0); + release_pages(req->pages, req->num_pages, false); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b7989f2ab4c..0c6048247a3 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -112,6 +112,16 @@ void fuse_invalidate_attr(struct inode *inode) get_fuse_inode(inode)->i_time = 0; } +/** + * Mark the attributes as stale due to an atime change. Avoid the invalidate if + * atime is not used. + */ +void fuse_invalidate_atime(struct inode *inode) +{ + if (!IS_RDONLY(inode)) + fuse_invalidate_attr(inode); +} + /* * Just mark the entry as stale, so that a next attempt to look it up * will result in a new lookup call to userspace @@ -188,7 +198,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) goto invalid; - else if (fuse_dentry_time(entry) < get_jiffies_64()) { + else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || + (flags & LOOKUP_REVAL)) { int err; struct fuse_entry_out outarg; struct fuse_req *req; @@ -342,24 +353,6 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, return err; } -static struct dentry *fuse_materialise_dentry(struct dentry *dentry, - struct inode *inode) -{ - struct dentry *newent; - - if (inode && S_ISDIR(inode->i_mode)) { - struct fuse_conn *fc = get_fuse_conn(inode); - - mutex_lock(&fc->inst_mutex); - newent = d_materialise_unique(dentry, inode); - mutex_unlock(&fc->inst_mutex); - } else { - newent = d_materialise_unique(dentry, inode); - } - - return newent; -} - static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { @@ -382,7 +375,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, if (inode && get_node_id(inode) == FUSE_ROOT_ID) goto out_iput; - newent = fuse_materialise_dentry(entry, inode); + newent = d_materialise_unique(entry, inode); err = PTR_ERR(newent); if (IS_ERR(newent)) goto out_err; @@ -601,21 +594,9 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, } kfree(forget); - if (S_ISDIR(inode->i_mode)) { - struct dentry *alias; - mutex_lock(&fc->inst_mutex); - alias = d_find_alias(inode); - if (alias) { - /* New directory must have moved since mkdir */ - mutex_unlock(&fc->inst_mutex); - dput(alias); - iput(inode); - return -EBUSY; - } - d_instantiate(entry, inode); - mutex_unlock(&fc->inst_mutex); - } else - d_instantiate(entry, inode); + err = d_instantiate_no_diralias(entry, inode); + if (err) + return err; fuse_change_entry_timeout(entry, &outarg); fuse_invalidate_attr(dir); @@ -699,6 +680,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, return create_new_entry(fc, req, dir, entry, S_IFLNK); } +static inline void fuse_update_ctime(struct inode *inode) +{ + if (!IS_NOCMTIME(inode)) { + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty_sync(inode); + } +} + static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; @@ -733,6 +722,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -763,23 +753,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) +static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags, int opcode, size_t argsize) { int err; - struct fuse_rename_in inarg; + struct fuse_rename2_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req_nopages(fc); + struct fuse_req *req; + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); + memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); - req->in.h.opcode = FUSE_RENAME; + inarg.flags = flags; + req->in.h.opcode = opcode; req->in.h.nodeid = get_node_id(olddir); req->in.numargs = 3; - req->in.args[0].size = sizeof(inarg); + req->in.args[0].size = argsize; req->in.args[0].value = &inarg; req->in.args[1].size = oldent->d_name.len + 1; req->in.args[1].value = oldent->d_name.name; @@ -791,15 +784,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, if (!err) { /* ctime changes */ fuse_invalidate_attr(oldent->d_inode); + fuse_update_ctime(oldent->d_inode); + + if (flags & RENAME_EXCHANGE) { + fuse_invalidate_attr(newent->d_inode); + fuse_update_ctime(newent->d_inode); + } fuse_invalidate_attr(olddir); if (olddir != newdir) fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (newent->d_inode) { + if (!(flags & RENAME_EXCHANGE) && newent->d_inode) { fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); + fuse_update_ctime(newent->d_inode); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the @@ -815,6 +815,42 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, return err; } +static int fuse_rename2(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(olddir); + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags) { + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; + + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, + sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + } else { + err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, + sizeof(struct fuse_rename_in)); + } + + return err; +} + +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return fuse_rename2(olddir, oldent, newdir, newent, 0); +} + static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { @@ -849,6 +885,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, inc_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + fuse_update_ctime(inode); } else if (err == -EINTR) { fuse_invalidate_attr(inode); } @@ -859,6 +896,16 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; + struct fuse_conn *fc = get_fuse_conn(inode); + + /* see the comment in fuse_change_attributes() */ + if (fc->writeback_cache && S_ISREG(inode->i_mode)) { + attr->size = i_size_read(inode); + attr->mtime = inode->i_mtime.tv_sec; + attr->mtimensec = inode->i_mtime.tv_nsec; + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; + } stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; @@ -945,7 +992,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, int err; bool r; - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { r = true; err = fuse_do_getattr(inode, stat, file); } else { @@ -1131,7 +1178,7 @@ static int fuse_permission(struct inode *inode, int mask) ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { refreshed = true; err = fuse_perm_getattr(inode, mask); @@ -1284,7 +1331,7 @@ static int fuse_direntplus_link(struct file *file, if (!inode) goto out; - alias = fuse_materialise_dentry(dentry, inode); + alias = d_materialise_unique(dentry, inode); err = PTR_ERR(alias); if (IS_ERR(alias)) goto out; @@ -1401,7 +1448,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) } __free_page(page); - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); return err; } @@ -1434,7 +1481,7 @@ static char *read_link(struct dentry *dentry) link[req->out.args[0].size] = '\0'; out: fuse_put_request(fc, req); - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); return link; } @@ -1497,12 +1544,16 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); } -static bool update_mtime(unsigned ivalid) +static bool update_mtime(unsigned ivalid, bool trust_local_mtime) { /* Always update if mtime is explicitly set */ if (ivalid & ATTR_MTIME_SET) return true; + /* Or if kernel i_mtime is the official one */ + if (trust_local_mtime) + return true; + /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) return false; @@ -1511,7 +1562,8 @@ static bool update_mtime(unsigned ivalid) return true; } -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) +static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, + bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; @@ -1530,13 +1582,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } - if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) { + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; - if (!(ivalid & ATTR_MTIME_SET)) + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) arg->valid |= FATTR_MTIME_NOW; } + if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { + arg->valid |= FATTR_CTIME; + arg->ctime = iattr->ia_ctime.tv_sec; + arg->ctimensec = iattr->ia_ctime.tv_nsec; + } } /* @@ -1583,6 +1640,62 @@ void fuse_release_nowrite(struct inode *inode) spin_unlock(&fc->lock); } +static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, + struct inode *inode, + struct fuse_setattr_in *inarg_p, + struct fuse_attr_out *outarg_p) +{ + req->in.h.opcode = FUSE_SETATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(*inarg_p); + req->in.args[0].value = inarg_p; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + else + req->out.args[0].size = sizeof(*outarg_p); + req->out.args[0].value = outarg_p; +} + +/* + * Flush inode->i_mtime to the server + */ +int fuse_flush_times(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_setattr_in inarg; + struct fuse_attr_out outarg; + int err; + + req = fuse_get_req_nopages(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + + inarg.valid = FATTR_MTIME; + inarg.mtime = inode->i_mtime.tv_sec; + inarg.mtimensec = inode->i_mtime.tv_nsec; + if (fc->minor >= 23) { + inarg.valid |= FATTR_CTIME; + inarg.ctime = inode->i_ctime.tv_sec; + inarg.ctimensec = inode->i_ctime.tv_nsec; + } + if (ff) { + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } + fuse_setattr_fill(fc, req, inode, &inarg, &outarg); + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + return err; +} + /* * Set attributes, and at the same time refresh them. * @@ -1600,8 +1713,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate = false; + bool is_wb = fc->writeback_cache; loff_t oldsize; int err; + bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) attr->ia_valid |= ATTR_FORCE; @@ -1626,11 +1741,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (trust_local_cmtime && attr->ia_size != inode->i_size) + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; } memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg); + iattr_to_fattr(attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1641,17 +1758,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, inarg.valid |= FATTR_LOCKOWNER; inarg.lock_owner = fuse_lock_owner_id(fc, current->files); } - req->in.h.opcode = FUSE_SETATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; - if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; - else - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; + fuse_setattr_fill(fc, req, inode, &inarg, &outarg); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); @@ -1668,10 +1775,21 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, } spin_lock(&fc->lock); + /* the kernel maintains i_mtime locally */ + if (trust_local_cmtime) { + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + /* FIXME: clear I_DIRTY_SYNC? */ + } + fuse_change_attributes_common(inode, &outarg.attr, attr_timeout(&outarg)); oldsize = inode->i_size; - i_size_write(inode, outarg.attr.size); + /* see the comment in fuse_change_attributes() */ + if (!is_wb || is_truncate || !S_ISREG(inode->i_mode)) + i_size_write(inode, outarg.attr.size); if (is_truncate) { /* NOTE: this may release/reacquire fc->lock */ @@ -1683,7 +1801,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, * Only call invalidate_inode_pages2() after removing * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. */ - if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + if ((is_truncate || !is_wb) && + S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(inode->i_mapping); } @@ -1759,8 +1878,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name, fc->no_setxattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } @@ -1890,8 +2011,10 @@ static int fuse_removexattr(struct dentry *entry, const char *name) fc->no_removexattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } @@ -1902,6 +2025,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .unlink = fuse_unlink, .rmdir = fuse_rmdir, .rename = fuse_rename, + .rename2 = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4598345ab87..40ac2628ddc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -127,7 +127,15 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) if (atomic_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (sync) { + if (ff->fc->no_open) { + /* + * Drop the release request when client does not + * implement 'open' + */ + req->background = 0; + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else if (sync) { req->background = 0; fuse_request_send(ff->fc, req); path_put(&req->misc.release.path); @@ -144,33 +152,58 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir) { - struct fuse_open_out outarg; struct fuse_file *ff; - int err; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; ff = fuse_file_alloc(fc); if (!ff) return -ENOMEM; - err = fuse_send_open(fc, nodeid, file, opcode, &outarg); - if (err) { - fuse_file_free(ff); - return err; + ff->fh = 0; + ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ + if (!fc->no_open || isdir) { + struct fuse_open_out outarg; + int err; + + err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + if (!err) { + ff->fh = outarg.fh; + ff->open_flags = outarg.open_flags; + + } else if (err != -ENOSYS || isdir) { + fuse_file_free(ff); + return err; + } else { + fc->no_open = 1; + } } if (isdir) - outarg.open_flags &= ~FOPEN_DIRECT_IO; + ff->open_flags &= ~FOPEN_DIRECT_IO; - ff->fh = outarg.fh; ff->nodeid = nodeid; - ff->open_flags = outarg.open_flags; file->private_data = fuse_file_get(ff); return 0; } EXPORT_SYMBOL_GPL(fuse_do_open); +static void fuse_link_write_file(struct file *file) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff = file->private_data; + /* + * file may be written through mmap, so chain it onto the + * inodes's write_file list + */ + spin_lock(&fc->lock); + if (list_empty(&ff->write_entry)) + list_add(&ff->write_entry, &fi->write_files); + spin_unlock(&fc->lock); +} + void fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; @@ -190,25 +223,37 @@ void fuse_finish_open(struct inode *inode, struct file *file) i_size_write(inode, 0); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + if (fc->writeback_cache) + file_update_time(file); } + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) + fuse_link_write_file(file); } int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_conn *fc = get_fuse_conn(inode); int err; + bool lock_inode = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && + fc->writeback_cache; err = generic_file_open(inode, file); if (err) return err; + if (lock_inode) + mutex_lock(&inode->i_mutex); + err = fuse_do_open(fc, get_node_id(inode), file, isdir); - if (err) - return err; - fuse_finish_open(inode, file); + if (!err) + fuse_finish_open(inode, file); - return 0; + if (lock_inode) + mutex_unlock(&inode->i_mutex); + + return err; } static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) @@ -275,6 +320,12 @@ static int fuse_open(struct inode *inode, struct file *file) static int fuse_release(struct inode *inode, struct file *file) { + struct fuse_conn *fc = get_fuse_conn(inode); + + /* see fuse_vma_close() for !writeback_cache case */ + if (fc->writeback_cache) + write_inode_now(inode, 1); + fuse_release_common(file, FUSE_RELEASE); /* return value is ignored by VFS */ @@ -316,12 +367,13 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) } /* - * Check if page is under writeback + * Check if any page in a range is under writeback * * This is currently done by walking the list of writepage requests * for the inode, which can be pretty inefficient. */ -static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, + pgoff_t idx_to) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -334,7 +386,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) BUG_ON(req->inode != inode); curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; - if (curr_index == index) { + if (idx_from < curr_index + req->num_pages && + curr_index <= idx_to) { found = true; break; } @@ -344,6 +397,11 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) return found; } +static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ + return fuse_range_is_writeback(inode, index, index); +} + /* * Wait for page writeback to be completed. * @@ -358,6 +416,21 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) return 0; } +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. + * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ +static void fuse_sync_writes(struct inode *inode) +{ + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); +} + static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); @@ -373,6 +446,14 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; + err = write_inode_now(inode, 1); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + fuse_sync_writes(inode); + mutex_unlock(&inode->i_mutex); + req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -393,21 +474,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) return err; } -/* - * Wait for all pending writepages on the inode to finish. - * - * This is currently done by blocking further writes with FUSE_NOWRITE - * and waiting for all sent writes to complete. - * - * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage - * could conflict with truncation. - */ -static void fuse_sync_writes(struct inode *inode) -{ - fuse_set_nowrite(inode); - fuse_release_nowrite(inode); -} - int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int isdir) { @@ -421,13 +487,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (err) - return err; - - if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) - return 0; - mutex_lock(&inode->i_mutex); /* @@ -435,11 +494,17 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, * wait for all outstanding writes, before sending the FSYNC * request. */ - err = write_inode_now(inode, 0); + err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) goto out; fuse_sync_writes(inode); + err = sync_inode_metadata(inode, 1); + if (err) + goto out; + + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + goto out; req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { @@ -637,7 +702,33 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, spin_unlock(&fc->lock); } -static int fuse_readpage(struct file *file, struct page *page) +static void fuse_short_read(struct fuse_req *req, struct inode *inode, + u64 attr_ver) +{ + size_t num_read = req->out.args[0].size; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fc->writeback_cache) { + /* + * A hole in a file. Some data after the hole are in page cache, + * but have not reached the client fs yet. So, the hole is not + * present there. + */ + int i; + int start_idx = num_read >> PAGE_CACHE_SHIFT; + size_t off = num_read & (PAGE_CACHE_SIZE - 1); + + for (i = start_idx; i < req->num_pages; i++) { + zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE); + off = 0; + } + } else { + loff_t pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, attr_ver); + } +} + +static int fuse_do_readpage(struct file *file, struct page *page) { struct fuse_io_priv io = { .async = 0, .file = file }; struct inode *inode = page->mapping->host; @@ -649,10 +740,6 @@ static int fuse_readpage(struct file *file, struct page *page) u64 attr_ver; int err; - err = -EIO; - if (is_bad_inode(inode)) - goto out; - /* * Page writeback can extend beyond the lifetime of the * page-cache page, so make sure we read a properly synced @@ -661,9 +748,8 @@ static int fuse_readpage(struct file *file, struct page *page) fuse_wait_on_page_writeback(inode, page->index); req = fuse_get_req(fc, 1); - err = PTR_ERR(req); if (IS_ERR(req)) - goto out; + return PTR_ERR(req); attr_ver = fuse_get_attr_version(fc); @@ -674,19 +760,33 @@ static int fuse_readpage(struct file *file, struct page *page) req->page_descs[0].length = count; num_read = fuse_send_read(req, &io, pos, count, NULL); err = req->out.h.error; - fuse_put_request(fc, req); if (!err) { /* * Short read means EOF. If file size is larger, truncate it */ if (num_read < count) - fuse_read_update_size(inode, pos + num_read, attr_ver); + fuse_short_read(req, inode, attr_ver); SetPageUptodate(page); } - fuse_invalidate_attr(inode); /* atime changed */ + fuse_put_request(fc, req); + + return err; +} + +static int fuse_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + err = fuse_do_readpage(file, page); + fuse_invalidate_atime(inode); out: unlock_page(page); return err; @@ -708,14 +808,10 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) /* * Short read means EOF. If file size is larger, truncate it */ - if (!req->out.h.error && num_read < count) { - loff_t pos; + if (!req->out.h.error && num_read < count) + fuse_short_read(req, inode, req->misc.read.attr_ver); - pos = page_offset(req->pages[0]) + num_read; - fuse_read_update_size(inode, pos, - req->misc.read.attr_ver); - } - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); } for (i = 0; i < req->num_pages; i++) { @@ -837,8 +933,7 @@ out: return err; } -static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); @@ -849,14 +944,14 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, * i_size is up to date). */ if (fc->auto_inval_data || - (pos + iov_length(iov, nr_segs) > i_size_read(inode))) { + (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err; err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); if (err) return err; } - return generic_file_aio_read(iocb, iov, nr_segs, pos); + return generic_file_read_iter(iocb, to); } static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, @@ -904,16 +999,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, return req->misc.write.out.size; } -void fuse_write_update_size(struct inode *inode, loff_t pos) +bool fuse_write_update_size(struct inode *inode, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + bool ret = false; spin_lock(&fc->lock); fi->attr_version = ++fc->attr_version; - if (pos > inode->i_size) + if (pos > inode->i_size) { i_size_write(inode, pos); + ret = true; + } spin_unlock(&fc->lock); + + return ret; } static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, @@ -985,13 +1085,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); - pagefault_enable(); flush_dcache_page(page); - mark_page_accessed(page); - if (!tmp) { unlock_page(page); page_cache_release(page); @@ -1084,28 +1180,27 @@ static ssize_t fuse_perform_write(struct file *file, return res > 0 ? res : err; } -static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - size_t count = 0; - size_t ocount = 0; + size_t count = iov_iter_count(from); ssize_t written = 0; ssize_t written_buffered = 0; struct inode *inode = mapping->host; ssize_t err; - struct iov_iter i; loff_t endbyte = 0; + loff_t pos = iocb->ki_pos; - WARN_ON(iocb->ki_pos != pos); + if (get_fuse_conn(inode)->writeback_cache) { + /* Update size (EOF optimization) and mode (SUID clearing) */ + err = fuse_update_attributes(mapping->host, NULL, file, NULL); + if (err) + return err; - ocount = 0; - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) - return err; + return generic_file_write_iter(iocb, from); + } - count = ocount; mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ @@ -1118,6 +1213,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; + iov_iter_truncate(from, count); err = file_remove_suid(file); if (err) goto out; @@ -1127,17 +1223,13 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, goto out; if (file->f_flags & O_DIRECT) { - written = generic_file_direct_write(iocb, iov, &nr_segs, - pos, &iocb->ki_pos, - count, ocount); - if (written < 0 || written == count) + written = generic_file_direct_write(iocb, from, pos); + if (written < 0 || !iov_iter_count(from)) goto out; pos += written; - count -= written; - iov_iter_init(&i, iov, nr_segs, count, written); - written_buffered = fuse_perform_write(file, mapping, &i, pos); + written_buffered = fuse_perform_write(file, mapping, from, pos); if (written_buffered < 0) { err = written_buffered; goto out; @@ -1156,8 +1248,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, written += written_buffered; iocb->ki_pos = pos + written_buffered; } else { - iov_iter_init(&i, iov, nr_segs, count, 0); - written = fuse_perform_write(file, mapping, &i, pos); + written = fuse_perform_write(file, mapping, from, pos); if (written >= 0) iocb->ki_pos = pos + written; } @@ -1195,7 +1286,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, size_t nbytes = 0; /* # bytes already packed in req */ /* Special case for kernel I/O: can copy directly into the buffer */ - if (segment_eq(get_fs(), KERNEL_DS)) { + if (ii->type & ITER_KVEC) { unsigned long user_addr = fuse_get_user_addr(ii); size_t frag_size = fuse_get_frag_size(ii, *nbytesp); @@ -1211,35 +1302,26 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, while (nbytes < *nbytesp && req->num_pages < req->max_pages) { unsigned npages; - unsigned long user_addr = fuse_get_user_addr(ii); - unsigned offset = user_addr & ~PAGE_MASK; - size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes); - int ret; - + size_t start; unsigned n = req->max_pages - req->num_pages; - frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT); - - npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - npages = clamp(npages, 1U, n); - - ret = get_user_pages_fast(user_addr, npages, !write, - &req->pages[req->num_pages]); + ssize_t ret = iov_iter_get_pages(ii, + &req->pages[req->num_pages], + n * PAGE_SIZE, &start); if (ret < 0) return ret; - npages = ret; - frag_size = min_t(size_t, frag_size, - (npages << PAGE_SHIFT) - offset); - iov_iter_advance(ii, frag_size); + iov_iter_advance(ii, ret); + nbytes += ret; + + ret += start; + npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; - req->page_descs[req->num_pages].offset = offset; + req->page_descs[req->num_pages].offset = start; fuse_page_descs_length_init(req, req->num_pages, npages); req->num_pages += npages; req->page_descs[req->num_pages - 1].length -= - (npages << PAGE_SHIFT) - offset - frag_size; - - nbytes += frag_size; + (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } if (write) @@ -1254,48 +1336,46 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, static inline int fuse_iter_npages(const struct iov_iter *ii_p) { - struct iov_iter ii = *ii_p; - int npages = 0; - - while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) { - unsigned long user_addr = fuse_get_user_addr(&ii); - unsigned offset = user_addr & ~PAGE_MASK; - size_t frag_size = iov_iter_single_seg_count(&ii); - - npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - iov_iter_advance(&ii, frag_size); - } - - return min(npages, FUSE_MAX_PAGES_PER_REQ); + return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ); } -ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, - unsigned long nr_segs, size_t count, loff_t *ppos, - int write) +ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, + loff_t *ppos, int flags) { + int write = flags & FUSE_DIO_WRITE; + int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; + struct inode *inode = file->f_mapping->host; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; size_t nmax = write ? fc->max_write : fc->max_read; loff_t pos = *ppos; + size_t count = iov_iter_count(iter); + pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT; + pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT; ssize_t res = 0; struct fuse_req *req; - struct iov_iter ii; - - iov_iter_init(&ii, iov, nr_segs, count, 0); if (io->async) - req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii)); + req = fuse_get_req_for_background(fc, fuse_iter_npages(iter)); else - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + req = fuse_get_req(fc, fuse_iter_npages(iter)); if (IS_ERR(req)) return PTR_ERR(req); + if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!write) + mutex_lock(&inode->i_mutex); + fuse_sync_writes(inode); + if (!write) + mutex_unlock(&inode->i_mutex); + } + while (count) { size_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); - int err = fuse_get_user_pages(req, &ii, &nbytes, write); + int err = fuse_get_user_pages(req, iter, &nbytes, write); if (err) { res = err; break; @@ -1325,9 +1405,9 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, fuse_put_request(fc, req); if (io->async) req = fuse_get_req_for_background(fc, - fuse_iter_npages(&ii)); + fuse_iter_npages(iter)); else - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + req = fuse_get_req(fc, fuse_iter_npages(iter)); if (IS_ERR(req)) break; } @@ -1342,9 +1422,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, EXPORT_SYMBOL_GPL(fuse_direct_io); static ssize_t __fuse_direct_read(struct fuse_io_priv *io, - const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos, - size_t count) + struct iov_iter *iter, + loff_t *ppos) { ssize_t res; struct file *file = io->file; @@ -1353,7 +1432,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, if (is_bad_inode(inode)) return -EIO; - res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0); + res = fuse_direct_io(io, iter, ppos, 0); fuse_invalidate_attr(inode); @@ -1365,21 +1444,26 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf, { struct fuse_io_priv io = { .async = 0, .file = file }; struct iovec iov = { .iov_base = buf, .iov_len = count }; - return __fuse_direct_read(&io, &iov, 1, ppos, count); + struct iov_iter ii; + iov_iter_init(&ii, READ, &iov, 1, count); + return __fuse_direct_read(&io, &ii, ppos); } static ssize_t __fuse_direct_write(struct fuse_io_priv *io, - const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) + struct iov_iter *iter, + loff_t *ppos) { struct file *file = io->file; struct inode *inode = file_inode(file); - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); ssize_t res; + res = generic_write_checks(file, ppos, &count, 0); - if (!res) - res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1); + if (!res) { + iov_iter_truncate(iter, count); + res = fuse_direct_io(io, iter, ppos, FUSE_DIO_WRITE); + } fuse_invalidate_attr(inode); @@ -1393,13 +1477,15 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, struct inode *inode = file_inode(file); ssize_t res; struct fuse_io_priv io = { .async = 0, .file = file }; + struct iov_iter ii; + iov_iter_init(&ii, WRITE, &iov, 1, count); if (is_bad_inode(inode)) return -EIO; /* Don't allow parallel writes to the same file */ mutex_lock(&inode->i_mutex); - res = __fuse_direct_write(&io, &iov, 1, ppos); + res = __fuse_direct_write(&io, &ii, ppos); if (res > 0) fuse_write_update_size(inode, *ppos); mutex_unlock(&inode->i_mutex); @@ -1409,8 +1495,13 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { - __free_page(req->pages[0]); - fuse_file_put(req->ff, false); + int i; + + for (i = 0; i < req->num_pages; i++) + __free_page(req->pages[i]); + + if (req->ff) + fuse_file_put(req->ff, false); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) @@ -1418,30 +1509,34 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) struct inode *inode = req->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + int i; list_del(&req->writepages_entry); - dec_bdi_stat(bdi, BDI_WRITEBACK); - dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + for (i = 0; i < req->num_pages; i++) { + dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); + bdi_writeout_inc(bdi); + } wake_up(&fi->page_waitq); } /* Called under fc->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, + loff_t size) __releases(fc->lock) __acquires(fc->lock) { struct fuse_inode *fi = get_fuse_inode(req->inode); - loff_t size = i_size_read(req->inode); struct fuse_write_in *inarg = &req->misc.write.in; + __u64 data_size = req->num_pages * PAGE_CACHE_SIZE; if (!fc->connected) goto out_free; - if (inarg->offset + PAGE_CACHE_SIZE <= size) { - inarg->size = PAGE_CACHE_SIZE; + if (inarg->offset + data_size <= size) { + inarg->size = data_size; } else if (inarg->offset < size) { - inarg->size = size & (PAGE_CACHE_SIZE - 1); + inarg->size = size - inarg->offset; } else { /* Got truncated off completely */ goto out_free; @@ -1472,12 +1567,13 @@ __acquires(fc->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + size_t crop = i_size_read(inode); struct fuse_req *req; while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { req = list_entry(fi->queued_writes.next, struct fuse_req, list); list_del_init(&req->list); - fuse_send_writepage(fc, req); + fuse_send_writepage(fc, req, crop); } } @@ -1488,12 +1584,85 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) mapping_set_error(inode->i_mapping, req->out.h.error); spin_lock(&fc->lock); + while (req->misc.write.next) { + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_req *next = req->misc.write.next; + req->misc.write.next = next->misc.write.next; + next->misc.write.next = NULL; + next->ff = fuse_file_get(req->ff); + list_add(&next->writepages_entry, &fi->writepages); + + /* + * Skip fuse_flush_writepages() to make it easy to crop requests + * based on primary request size. + * + * 1st case (trivial): there are no concurrent activities using + * fuse_set/release_nowrite. Then we're on safe side because + * fuse_flush_writepages() would call fuse_send_writepage() + * anyway. + * + * 2nd case: someone called fuse_set_nowrite and it is waiting + * now for completion of all in-flight requests. This happens + * rarely and no more than once per page, so this should be + * okay. + * + * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle + * of fuse_set_nowrite..fuse_release_nowrite section. The fact + * that fuse_set_nowrite returned implies that all in-flight + * requests were completed along with all of their secondary + * requests. Further primary requests are blocked by negative + * writectr. Hence there cannot be any in-flight requests and + * no invocations of fuse_writepage_end() while we're in + * fuse_set_nowrite..fuse_release_nowrite section. + */ + fuse_send_writepage(fc, next, inarg->offset + inarg->size); + } fi->writectr--; fuse_writepage_finish(fc, req); spin_unlock(&fc->lock); fuse_writepage_free(fc, req); } +static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) +{ + struct fuse_file *ff = NULL; + + spin_lock(&fc->lock); + if (!list_empty(&fi->write_files)) { + ff = list_entry(fi->write_files.next, struct fuse_file, + write_entry); + fuse_file_get(ff); + } + spin_unlock(&fc->lock); + + return ff; +} + +static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) +{ + struct fuse_file *ff = __fuse_write_file_get(fc, fi); + WARN_ON(!ff); + return ff; +} + +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff; + int err; + + ff = __fuse_write_file_get(fc, fi); + err = fuse_flush_times(inode, ff); + if (ff) + fuse_file_put(ff, 0); + + return err; +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; @@ -1501,8 +1670,8 @@ static int fuse_writepage_locked(struct page *page) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_req *req; - struct fuse_file *ff; struct page *tmp_page; + int error = -ENOMEM; set_page_writeback(page); @@ -1515,16 +1684,16 @@ static int fuse_writepage_locked(struct page *page) if (!tmp_page) goto err_free; - spin_lock(&fc->lock); - BUG_ON(list_empty(&fi->write_files)); - ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); - req->ff = fuse_file_get(ff); - spin_unlock(&fc->lock); + error = -EIO; + req->ff = fuse_write_file_get(fc, fi); + if (!req->ff) + goto err_nofile; - fuse_write_fill(req, ff, page_offset(page), 0); + fuse_write_fill(req, req->ff, page_offset(page), 0); copy_highpage(tmp_page, page); req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; + req->misc.write.next = NULL; req->in.argpages = 1; req->num_pages = 1; req->pages[0] = tmp_page; @@ -1546,23 +1715,340 @@ static int fuse_writepage_locked(struct page *page) return 0; +err_nofile: + __free_page(tmp_page); err_free: fuse_request_free(req); err: end_page_writeback(page); - return -ENOMEM; + return error; } static int fuse_writepage(struct page *page, struct writeback_control *wbc) { int err; + if (fuse_page_is_writeback(page->mapping->host, page->index)) { + /* + * ->writepages() should be called for sync() and friends. We + * should only get here on direct reclaim and then we are + * allowed to skip a page which is already in flight + */ + WARN_ON(wbc->sync_mode == WB_SYNC_ALL); + + redirty_page_for_writepage(wbc, page); + return 0; + } + err = fuse_writepage_locked(page); unlock_page(page); return err; } +struct fuse_fill_wb_data { + struct fuse_req *req; + struct fuse_file *ff; + struct inode *inode; + struct page **orig_pages; +}; + +static void fuse_writepages_send(struct fuse_fill_wb_data *data) +{ + struct fuse_req *req = data->req; + struct inode *inode = data->inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + int num_pages = req->num_pages; + int i; + + req->ff = fuse_file_get(data->ff); + spin_lock(&fc->lock); + list_add_tail(&req->list, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fc->lock); + + for (i = 0; i < num_pages; i++) + end_page_writeback(data->orig_pages[i]); +} + +static bool fuse_writepage_in_flight(struct fuse_req *new_req, + struct page *page) +{ + struct fuse_conn *fc = get_fuse_conn(new_req->inode); + struct fuse_inode *fi = get_fuse_inode(new_req->inode); + struct fuse_req *tmp; + struct fuse_req *old_req; + bool found = false; + pgoff_t curr_index; + + BUG_ON(new_req->num_pages != 0); + + spin_lock(&fc->lock); + list_del(&new_req->writepages_entry); + list_for_each_entry(old_req, &fi->writepages, writepages_entry) { + BUG_ON(old_req->inode != new_req->inode); + curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (curr_index <= page->index && + page->index < curr_index + old_req->num_pages) { + found = true; + break; + } + } + if (!found) { + list_add(&new_req->writepages_entry, &fi->writepages); + goto out_unlock; + } + + new_req->num_pages = 1; + for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) { + BUG_ON(tmp->inode != new_req->inode); + curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (tmp->num_pages == 1 && + curr_index == page->index) { + old_req = tmp; + } + } + + if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || + old_req->state == FUSE_REQ_PENDING)) { + struct backing_dev_info *bdi = page->mapping->backing_dev_info; + + copy_highpage(old_req->pages[0], page); + spin_unlock(&fc->lock); + + dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_zone_page_state(page, NR_WRITEBACK_TEMP); + bdi_writeout_inc(bdi); + fuse_writepage_free(fc, new_req); + fuse_request_free(new_req); + goto out; + } else { + new_req->misc.write.next = old_req->misc.write.next; + old_req->misc.write.next = new_req; + } +out_unlock: + spin_unlock(&fc->lock); +out: + return found; +} + +static int fuse_writepages_fill(struct page *page, + struct writeback_control *wbc, void *_data) +{ + struct fuse_fill_wb_data *data = _data; + struct fuse_req *req = data->req; + struct inode *inode = data->inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct page *tmp_page; + bool is_writeback; + int err; + + if (!data->ff) { + err = -EIO; + data->ff = fuse_write_file_get(fc, get_fuse_inode(inode)); + if (!data->ff) + goto out_unlock; + } + + /* + * Being under writeback is unlikely but possible. For example direct + * read to an mmaped fuse file will set the page dirty twice; once when + * the pages are faulted with get_user_pages(), and then after the read + * completed. + */ + is_writeback = fuse_page_is_writeback(inode, page->index); + + if (req && req->num_pages && + (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write || + data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { + fuse_writepages_send(data); + data->req = NULL; + } + err = -ENOMEM; + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto out_unlock; + + /* + * The page must not be redirtied until the writeout is completed + * (i.e. userspace has sent a reply to the write request). Otherwise + * there could be more than one temporary page instance for each real + * page. + * + * This is ensured by holding the page lock in page_mkwrite() while + * checking fuse_page_is_writeback(). We already hold the page lock + * since clear_page_dirty_for_io() and keep it held until we add the + * request to the fi->writepages list and increment req->num_pages. + * After this fuse_page_is_writeback() will indicate that the page is + * under writeback, so we can release the page lock. + */ + if (data->req == NULL) { + struct fuse_inode *fi = get_fuse_inode(inode); + + err = -ENOMEM; + req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ); + if (!req) { + __free_page(tmp_page); + goto out_unlock; + } + + fuse_write_fill(req, data->ff, page_offset(page), 0); + req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; + req->misc.write.next = NULL; + req->in.argpages = 1; + req->background = 1; + req->num_pages = 0; + req->end = fuse_writepage_end; + req->inode = inode; + + spin_lock(&fc->lock); + list_add(&req->writepages_entry, &fi->writepages); + spin_unlock(&fc->lock); + + data->req = req; + } + set_page_writeback(page); + + copy_highpage(tmp_page, page); + req->pages[req->num_pages] = tmp_page; + req->page_descs[req->num_pages].offset = 0; + req->page_descs[req->num_pages].length = PAGE_SIZE; + + inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); + inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + + err = 0; + if (is_writeback && fuse_writepage_in_flight(req, page)) { + end_page_writeback(page); + data->req = NULL; + goto out_unlock; + } + data->orig_pages[req->num_pages] = page; + + /* + * Protected by fc->lock against concurrent access by + * fuse_page_is_writeback(). + */ + spin_lock(&fc->lock); + req->num_pages++; + spin_unlock(&fc->lock); + +out_unlock: + unlock_page(page); + + return err; +} + +static int fuse_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct fuse_fill_wb_data data; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + data.inode = inode; + data.req = NULL; + data.ff = NULL; + + err = -ENOMEM; + data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, + sizeof(struct page *), + GFP_NOFS); + if (!data.orig_pages) + goto out; + + err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); + if (data.req) { + /* Ignore errors if we can write at least one page */ + BUG_ON(!data.req->num_pages); + fuse_writepages_send(&data); + err = 0; + } + if (data.ff) + fuse_file_put(data.ff, false); + + kfree(data.orig_pages); +out: + return err; +} + +/* + * It's worthy to make sure that space is reserved on disk for the write, + * but how to implement it without killing performance need more thinking. + */ +static int fuse_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode); + struct page *page; + loff_t fsize; + int err = -ENOMEM; + + WARN_ON(!fc->writeback_cache); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + goto error; + + fuse_wait_on_page_writeback(mapping->host, page->index); + + if (PageUptodate(page) || len == PAGE_CACHE_SIZE) + goto success; + /* + * Check if the start this page comes after the end of file, in which + * case the readpage can be optimized away. + */ + fsize = i_size_read(mapping->host); + if (fsize <= (pos & PAGE_CACHE_MASK)) { + size_t off = pos & ~PAGE_CACHE_MASK; + if (off) + zero_user_segment(page, 0, off); + goto success; + } + err = fuse_do_readpage(file, page); + if (err) + goto cleanup; +success: + *pagep = page; + return 0; + +cleanup: + unlock_page(page); + page_cache_release(page); +error: + return err; +} + +static int fuse_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + if (!PageUptodate(page)) { + /* Zero any unwritten bytes at the end of the page */ + size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK; + if (endoff) + zero_user_segment(page, endoff, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } + + fuse_write_update_size(inode, pos + copied); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + static int fuse_launder_page(struct page *page) { int err = 0; @@ -1602,39 +2088,32 @@ static void fuse_vma_close(struct vm_area_struct *vma) static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; - /* - * Don't use page->mapping as it may become NULL from a - * concurrent truncate. - */ - struct inode *inode = vma->vm_file->f_mapping->host; + struct inode *inode = file_inode(vma->vm_file); + + file_update_time(vma->vm_file); + lock_page(page); + if (page->mapping != inode->i_mapping) { + unlock_page(page); + return VM_FAULT_NOPAGE; + } fuse_wait_on_page_writeback(inode, page->index); - return 0; + return VM_FAULT_LOCKED; } static const struct vm_operations_struct fuse_file_vm_ops = { .close = fuse_vma_close, .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_file *ff = file->private_data; - /* - * file may be written through mmap, so chain it onto the - * inodes's write_file list - */ - spin_lock(&fc->lock); - if (list_empty(&ff->write_entry)) - list_add(&ff->write_entry, &fi->write_files); - spin_unlock(&fc->lock); - } + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + fuse_link_write_file(file); + file_accessed(file); vma->vm_ops = &fuse_file_vm_ops; return 0; @@ -1792,7 +2271,6 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) struct fuse_file *ff = file->private_data; /* emulate flock with POSIX locks */ - fl->fl_owner = (fl_owner_t) file; ff->flock = true; err = fuse_setlk(file, fl, 1); } @@ -1863,7 +2341,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, if (!bytes) return 0; - iov_iter_init(&ii, iov, nr_segs, bytes, 0); + iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes); while (iov_iter_count(&ii)) { struct page *page = pages[page_idx++]; @@ -2281,7 +2759,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc, { spin_lock(&fc->lock); if (RB_EMPTY_NODE(&ff->polled_node)) { - struct rb_node **link, *parent; + struct rb_node **link, *uninitialized_var(parent); link = fuse_find_polled_node(fc, ff->kh, &parent); BUG_ON(*link); @@ -2385,8 +2863,8 @@ static inline loff_t fuse_round_up(loff_t off) } static ssize_t -fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { ssize_t ret = 0; struct file *file = iocb->ki_filp; @@ -2395,18 +2873,22 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos = 0; struct inode *inode; loff_t i_size; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); struct fuse_io_priv *io; pos = offset; inode = file->f_mapping->host; i_size = i_size_read(inode); + if ((rw == READ) && (offset > i_size)) + return 0; + /* optimization for short read */ if (async_dio && rw != WRITE && offset + count > i_size) { if (offset >= i_size) return 0; count = min_t(loff_t, count, fuse_round_up(i_size - offset)); + iov_iter_truncate(iter, count); } io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); @@ -2436,9 +2918,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, io->async = false; if (rw == WRITE) - ret = __fuse_direct_write(io, iov, nr_segs, &pos); + ret = __fuse_direct_write(io, iter, &pos); else - ret = __fuse_direct_read(io, iov, nr_segs, &pos, count); + ret = __fuse_direct_read(io, iter, &pos); if (io->async) { fuse_aio_complete(io, ret < 0 ? ret : 0, -1); @@ -2480,6 +2962,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_PUNCH_HOLE); + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (fc->no_fallocate) return -EOPNOTSUPP; @@ -2522,8 +3007,12 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, goto out; /* we could have extended the file */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) - fuse_write_update_size(inode, offset + length); + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + bool changed = fuse_write_update_size(inode, offset + length); + + if (changed && fc->writeback_cache) + file_update_time(file); + } if (mode & FALLOC_FL_PUNCH_HOLE) truncate_pagecache_range(inode, offset, offset + length - 1); @@ -2542,10 +3031,10 @@ out: static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, - .read = do_sync_read, - .aio_read = fuse_file_aio_read, - .write = do_sync_write, - .aio_write = fuse_file_aio_write, + .read = new_sync_read, + .read_iter = fuse_file_read_iter, + .write = new_sync_write, + .write_iter = fuse_file_write_iter, .mmap = fuse_file_mmap, .open = fuse_open, .flush = fuse_flush, @@ -2581,11 +3070,14 @@ static const struct file_operations fuse_direct_io_file_operations = { static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, .writepage = fuse_writepage, + .writepages = fuse_writepages, .launder_page = fuse_launder_page, .readpages = fuse_readpages, .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, + .write_begin = fuse_write_begin, + .write_end = fuse_write_end, }; void fuse_init_file_inode(struct inode *inode) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 5b9e6f3b6ae..e8e47a6ab51 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -321,6 +321,7 @@ struct fuse_req { struct { struct fuse_write_in in; struct fuse_write_out out; + struct fuse_req *next; } write; struct fuse_notify_retrieve_in retrieve_in; struct fuse_lk_in lk_in; @@ -374,12 +375,11 @@ struct fuse_conn { /** Lock protecting accessess to members of this structure */ spinlock_t lock; - /** Mutex protecting against directory alias creation */ - struct mutex inst_mutex; - /** Refcount */ atomic_t count; + struct rcu_head rcu; + /** The user id for this mount */ kuid_t user_id; @@ -480,11 +480,17 @@ struct fuse_conn { /** Set if bdi is valid */ unsigned bdi_initialized:1; + /** write-back cache policy (default is write-through) */ + unsigned writeback_cache:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction */ + /** Is open/release not implemented by fs? */ + unsigned no_open:1; + /** Is fsync not implemented by fs? */ unsigned no_fsync:1; @@ -536,6 +542,9 @@ struct fuse_conn { /** Is fallocate not implemented by fs? */ unsigned no_fallocate:1; + /** Is rename with flags implemented by fs? */ + unsigned no_rename2:1; + /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; @@ -717,7 +726,7 @@ int fuse_dev_init(void); void fuse_dev_cleanup(void); int fuse_ctl_init(void); -void fuse_ctl_cleanup(void); +void __exit fuse_ctl_cleanup(void); /** * Allocate a request @@ -788,6 +797,8 @@ void fuse_invalidate_attr(struct inode *inode); void fuse_invalidate_entry_cache(struct dentry *entry); +void fuse_invalidate_atime(struct inode *inode); + /** * Acquire reference to fuse_conn */ @@ -858,9 +869,19 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); -ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, - unsigned long nr_segs, size_t count, loff_t *ppos, - int write); + +/** + * fuse_direct_io() flags + */ + +/** If set, it is WRITE; otherwise - READ */ +#define FUSE_DIO_WRITE (1 << 0) + +/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */ +#define FUSE_DIO_CUSE (1 << 1) + +ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, + loff_t *ppos, int flags); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); long fuse_ioctl_common(struct file *file, unsigned int cmd, @@ -868,7 +889,10 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, unsigned fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); -void fuse_write_update_size(struct inode *inode, loff_t pos); +bool fuse_write_update_size(struct inode *inode, loff_t pos); + +int fuse_flush_times(struct inode *inode, struct fuse_file *ff); +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct file *file); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a8ce6dab60a..03246cd9d47 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -123,7 +123,7 @@ static void fuse_destroy_inode(struct inode *inode) static void fuse_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & MS_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -135,6 +135,7 @@ static void fuse_evict_inode(struct inode *inode) static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (*flags & MS_MANDLOCK) return -EINVAL; @@ -170,10 +171,13 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; + /* mtime from server may be stale due to local buffered write */ + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; + } if (attr->blksize != 0) inode->i_blkbits = ilog2(attr->blksize); @@ -197,6 +201,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + bool is_wb = fc->writeback_cache; loff_t oldsize; struct timespec old_mtime; @@ -211,10 +216,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_change_attributes_common(inode, attr, attr_valid); oldsize = inode->i_size; - i_size_write(inode, attr->size); + /* + * In case of writeback_cache enabled, the cached writes beyond EOF + * extend local i_size without keeping userspace server in sync. So, + * attr->size coming from server can be stale. We cannot trust it. + */ + if (!is_wb || !S_ISREG(inode->i_mode)) + i_size_write(inode, attr->size); spin_unlock(&fc->lock); - if (S_ISREG(inode->i_mode)) { + if (!is_wb && S_ISREG(inode->i_mode)) { bool inval = false; if (oldsize != attr->size) { @@ -243,6 +254,10 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode); @@ -289,7 +304,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, return NULL; if ((inode->i_state & I_NEW)) { - inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_flags |= S_NOATIME; + if (!fc->writeback_cache || !S_ISREG(attr->mode)) + inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; inode->i_data.backing_dev_info = &fc->bdi; fuse_init_inode(inode, attr); @@ -461,6 +478,17 @@ static const match_table_t tokens = { {OPT_ERR, NULL} }; +static int fuse_match_uint(substring_t *s, unsigned int *res) +{ + int err = -ENOMEM; + char *buf = match_strdup(s); + if (buf) { + err = kstrtouint(buf, 10, res); + kfree(buf); + } + return err; +} + static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) { char *p; @@ -471,6 +499,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) while ((p = strsep(&opt, ",")) != NULL) { int token; int value; + unsigned uv; substring_t args[MAX_OPT_ARGS]; if (!*p) continue; @@ -494,18 +523,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) break; case OPT_USER_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->user_id = make_kuid(current_user_ns(), value); + d->user_id = make_kuid(current_user_ns(), uv); if (!uid_valid(d->user_id)) return 0; d->user_id_present = 1; break; case OPT_GROUP_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->group_id = make_kgid(current_user_ns(), value); + d->group_id = make_kgid(current_user_ns(), uv); if (!gid_valid(d->group_id)) return 0; d->group_id_present = 1; @@ -565,7 +594,6 @@ void fuse_conn_init(struct fuse_conn *fc) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); - mutex_init(&fc->inst_mutex); init_rwsem(&fc->killsb); atomic_set(&fc->count, 1); init_waitqueue_head(&fc->waitq); @@ -596,7 +624,6 @@ void fuse_conn_put(struct fuse_conn *fc) if (atomic_dec_and_test(&fc->count)) { if (fc->destroy_req) fuse_request_free(fc->destroy_req); - mutex_destroy(&fc->inst_mutex); fc->release(fc); } } @@ -775,6 +802,7 @@ static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .destroy_inode = fuse_destroy_inode, .evict_inode = fuse_evict_inode, + .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, .remount_fs = fuse_remount_fs, .put_super = fuse_put_super, @@ -875,6 +903,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) } if (arg->flags & FUSE_ASYNC_DIO) fc->async_dio = 1; + if (arg->flags & FUSE_WRITEBACK_CACHE) + fc->writeback_cache = 1; + if (arg->time_gran && arg->time_gran <= 1000000000) + fc->sb->s_time_gran = arg->time_gran; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -902,7 +934,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | - FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO; + FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | + FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -920,7 +953,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) static void fuse_free_conn(struct fuse_conn *fc) { - kfree(fc); + kfree_rcu(fc, rcu); } static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) @@ -980,9 +1013,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; - sb->s_flags &= ~MS_NOSEC; + sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); - if (!parse_fuse_opt((char *) data, &d, is_bdev)) + if (!parse_fuse_opt(data, &d, is_bdev)) goto err; if (is_bdev) { diff --git a/fs/generic_acl.c b/fs/generic_acl.c deleted file mode 100644 index b3f3676796d..00000000000 --- a/fs/generic_acl.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * (C) 2005 Andreas Gruenbacher <agruen@suse.de> - * - * This file is released under the GPL. - * - * Generic ACL support for in-memory filesystems. - */ - -#include <linux/sched.h> -#include <linux/gfp.h> -#include <linux/fs.h> -#include <linux/generic_acl.h> -#include <linux/posix_acl.h> -#include <linux/posix_acl_xattr.h> - - -static size_t -generic_acl_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - struct posix_acl *acl; - const char *xname; - size_t size; - - acl = get_cached_acl(dentry->d_inode, type); - if (!acl) - return 0; - posix_acl_release(acl); - - switch (type) { - case ACL_TYPE_ACCESS: - xname = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - xname = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return 0; - } - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} - -static int -generic_acl_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - - acl = get_cached_acl(dentry->d_inode, type); - if (!acl) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -generic_acl_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (acl) { - error = posix_acl_valid(acl); - if (error) - goto failed; - switch (type) { - case ACL_TYPE_ACCESS: - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - goto failed; - inode->i_ctime = CURRENT_TIME; - if (error == 0) { - posix_acl_release(acl); - acl = NULL; - } - break; - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - error = -EINVAL; - goto failed; - } - break; - } - } - set_cached_acl(inode, type, acl); - error = 0; -failed: - posix_acl_release(acl); - return error; -} - -/** - * generic_acl_init - Take care of acl inheritance at @inode create time - * - * Files created inside a directory with a default ACL inherit the - * directory's default ACL. - */ -int -generic_acl_init(struct inode *inode, struct inode *dir) -{ - struct posix_acl *acl = NULL; - int error; - - if (!S_ISLNK(inode->i_mode)) - acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); - if (acl) { - if (S_ISDIR(inode->i_mode)) - set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - } else { - inode->i_mode &= ~current_umask(); - } - error = 0; - - posix_acl_release(acl); - return error; -} - -/** - * generic_acl_chmod - change the access acl of @inode upon chmod() - * - * A chmod also changes the permissions of the owner, group/mask, and - * other ACL entries. - */ -int -generic_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error = 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - if (acl) { - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - } - return error; -} - -const struct xattr_handler generic_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = generic_acl_list, - .get = generic_acl_get, - .set = generic_acl_set, -}; - -const struct xattr_handler generic_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = generic_acl_list, - .get = generic_acl_get, - .set = generic_acl_set, -}; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index f69ac0af549..3088e2a38e3 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -49,10 +49,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) if (!ip->i_eattr) return NULL; - acl = get_cached_acl(&ip->i_inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - name = gfs2_acl_name(type); if (name == NULL) return ERR_PTR(-EINVAL); @@ -68,19 +64,7 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) return acl; } -static int gfs2_set_mode(struct inode *inode, umode_t mode) -{ - int error = 0; - - if (mode != inode->i_mode) { - inode->i_mode = mode; - mark_inode_dirty(inode); - } - - return error; -} - -static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) +int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error; int len; @@ -88,219 +72,50 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) const char *name = gfs2_acl_name(type); BUG_ON(name == NULL); - len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); - if (len == 0) - return 0; - data = kmalloc(len, GFP_NOFS); - if (data == NULL) - return -ENOMEM; - error = posix_acl_to_xattr(&init_user_ns, acl, data, len); - if (error < 0) - goto out; - error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); - if (!error) - set_cached_acl(inode, type, acl); -out: - kfree(data); - return error; -} - -int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) -{ - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct posix_acl *acl; - umode_t mode = inode->i_mode; - int error = 0; - - if (!sdp->sd_args.ar_posix_acl) - return 0; - if (S_ISLNK(inode->i_mode)) - return 0; - - acl = gfs2_get_acl(&dip->i_inode, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) { - mode &= ~current_umask(); - return gfs2_set_mode(inode, mode); - } - - if (S_ISDIR(inode->i_mode)) { - error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto out; - } - - error = posix_acl_create(&acl, GFP_NOFS, &mode); - if (error < 0) - return error; - - if (error == 0) - goto munge; - - error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl); - if (error) - goto out; -munge: - error = gfs2_set_mode(inode, mode); -out: - posix_acl_release(acl); - return error; -} - -int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) -{ - struct inode *inode = &ip->i_inode; - struct posix_acl *acl; - char *data; - unsigned int len; - int error; - - acl = gfs2_get_acl(&ip->i_inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return gfs2_setattr_simple(inode, attr); - - error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); - if (error) - return error; - - len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); - data = kmalloc(len, GFP_NOFS); - error = -ENOMEM; - if (data == NULL) - goto out; - posix_acl_to_xattr(&init_user_ns, acl, data, len); - error = gfs2_xattr_acl_chmod(ip, attr, data); - kfree(data); - set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); - -out: - posix_acl_release(acl); - return error; -} - -static int gfs2_acl_type(const char *name) -{ - if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0) - return ACL_TYPE_ACCESS; - if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0) - return ACL_TYPE_DEFAULT; - return -EINVAL; -} - -static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int xtype) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct posix_acl *acl; - int type; - int error; - - if (!sdp->sd_args.ar_posix_acl) - return -EOPNOTSUPP; - - type = gfs2_acl_type(name); - if (type < 0) - return type; - - acl = gfs2_get_acl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, - int xtype) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct posix_acl *acl = NULL; - int error = 0, type; - - if (!sdp->sd_args.ar_posix_acl) - return -EOPNOTSUPP; - type = gfs2_acl_type(name); - if (type < 0) - return type; - if (flags & XATTR_CREATE) - return -EINVAL; - if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; - if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER)) - return -EPERM; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!value) - goto set_acl; - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (!acl) { - /* - * acl_set_file(3) may request that we set default ACLs with - * zero length -- defend (gracefully) against that here. - */ - goto out; - } - if (IS_ERR(acl)) { - error = PTR_ERR(acl); - goto out; - } - - error = posix_acl_valid(acl); - if (error) - goto out_release; - - error = -EINVAL; - if (acl->a_count > GFS2_ACL_MAX_ENTRIES) - goto out_release; + if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) + return -E2BIG; if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; - if (error <= 0) { - posix_acl_release(acl); + if (error == 0) acl = NULL; - if (error < 0) - return error; + if (mode != inode->i_mode) { + inode->i_mode = mode; + mark_inode_dirty(inode); } - - error = gfs2_set_mode(inode, mode); - if (error) - goto out_release; } -set_acl: - error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS); - if (!error) { - if (acl) - set_cached_acl(inode, type, acl); - else - forget_cached_acl(inode, type); + if (acl) { + len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); + if (len == 0) + return 0; + data = kmalloc(len, GFP_NOFS); + if (data == NULL) + return -ENOMEM; + error = posix_acl_to_xattr(&init_user_ns, acl, data, len); + if (error < 0) + goto out; + } else { + data = NULL; + len = 0; } -out_release: - posix_acl_release(acl); + + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); + if (error) + goto out; + + if (acl) + set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); out: + kfree(data); return error; } - -const struct xattr_handler gfs2_xattr_system_handler = { - .prefix = XATTR_SYSTEM_PREFIX, - .flags = GFS2_EATYPE_SYS, - .get = gfs2_xattr_system_get, - .set = gfs2_xattr_system_set, -}; - diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 0da38dc7efe..2d65ec4cd4b 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -14,11 +14,9 @@ #define GFS2_POSIX_ACL_ACCESS "posix_acl_access" #define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" -#define GFS2_ACL_MAX_ENTRIES 25 +#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); -extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); -extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); -extern const struct xattr_handler gfs2_xattr_system_handler; +extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 1f7d8057ea6..805b37fed63 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -21,6 +21,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/backing-dev.h> #include <linux/aio.h> +#include <trace/events/writeback.h> #include "gfs2.h" #include "incore.h" @@ -230,13 +231,11 @@ static int gfs2_writepages(struct address_space *mapping, static int gfs2_write_jdata_pagevec(struct address_space *mapping, struct writeback_control *wbc, struct pagevec *pvec, - int nr_pages, pgoff_t end) + int nr_pages, pgoff_t end, + pgoff_t *done_index) { struct inode *inode = mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset = i_size & (PAGE_CACHE_SIZE-1); unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); int i; int ret; @@ -248,40 +247,83 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, for(i = 0; i < nr_pages; i++) { struct page *page = pvec->pages[i]; + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) { + /* + * can't be range_cyclic (1st pass) because + * end == -1 in that case. + */ + ret = 1; + break; + } + + *done_index = page->index; + lock_page(page); if (unlikely(page->mapping != mapping)) { +continue_unlock: unlock_page(page); continue; } - if (!wbc->range_cyclic && page->index > end) { - ret = 1; - unlock_page(page); - continue; + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; } - /* Is the page fully outside i_size? (truncate in progress) */ - if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0, - PAGE_CACHE_SIZE); - unlock_page(page); - continue; - } + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + trace_wbc_writepage(wbc, mapping->backing_dev_info); ret = __gfs2_jdata_writepage(page, wbc); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + *done_index = page->index + 1; + ret = 1; + break; + } + } - if (ret || (--(wbc->nr_to_write) <= 0)) + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { ret = 1; + break; + } + } gfs2_trans_end(sdp); return ret; @@ -306,51 +348,69 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int done = 0; struct pagevec pvec; int nr_pages; + pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; - int scanned = 0; + pgoff_t done_index; + int cycled; int range_whole = 0; + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - scanned = 1; + cycled = 1; /* ignore range_cyclic tests */ } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - scanned = 1; - ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && (index <= end)) { + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index); if (ret) done = 1; if (ret > 0) ret = 0; - pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!cycled && !done) { /* + * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ - scanned = 1; + cycled = 1; index = 0; + end = writeback_index - 1; goto retry; } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + mapping->writeback_index = done_index; + return ret; } @@ -371,7 +431,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping, ret = gfs2_write_cache_jdata(mapping, wbc); if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { - gfs2_log_flush(sdp, ip->i_gl); + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); ret = gfs2_write_cache_jdata(mapping, wbc); } return ret; @@ -517,7 +577,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, p = kmap_atomic(page); memcpy(buf + copied, p + offset, amt); kunmap_atomic(p); - mark_page_accessed(page); page_cache_release(page); copied += amt; index++; @@ -611,12 +670,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); if (alloc_required) { + struct gfs2_alloc_parms ap = { .aflags = 0, }; error = gfs2_quota_lock_check(ip); if (error) goto out_unlock; requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip, requested, 0); + ap.target = requested; + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_qunlock; } @@ -979,11 +1040,11 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct address_space *mapping = inode->i_mapping; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int rv; @@ -1004,9 +1065,39 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, if (rv != 1) goto out; /* dio not valid, fall back to buffered i/o */ - rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, gfs2_get_block_direct, - NULL, NULL, 0); + /* + * Now since we are holding a deferred (CW) lock at this point, you + * might be wondering why this is ever needed. There is a case however + * where we've granted a deferred local lock against a cached exclusive + * glock. That is ok provided all granted local locks are deferred, but + * it also means that it is possible to encounter pages which are + * cached and possibly also mapped. So here we check for that and sort + * them out ahead of the dio. The glock state machine will take care of + * everything else. + * + * If in fact the cached glock state (gl->gl_state) is deferred (CW) in + * the first place, mapping->nr_pages will always be zero. + */ + if (mapping->nrpages) { + loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); + loff_t len = iov_iter_count(iter); + loff_t end = PAGE_ALIGN(offset + len) - 1; + + rv = 0; + if (len == 0) + goto out; + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); + rv = filemap_write_and_wait_range(mapping, lstart, end); + if (rv) + goto out; + if (rw == WRITE) + truncate_inode_pages_range(mapping, lstart, end); + } + + rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iter, offset, + gfs2_get_block_direct, NULL, NULL, 0); out: gfs2_glock_dq(&gh); gfs2_holder_uninit(&gh); @@ -1048,30 +1139,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bh = bh->b_this_page; } while(bh != head); spin_unlock(&sdp->sd_ail_lock); - gfs2_log_unlock(sdp); head = bh = page_buffers(page); do { - gfs2_log_lock(sdp); bd = bh->b_private; if (bd) { gfs2_assert_warn(sdp, bd->bd_bh == bh); - if (!list_empty(&bd->bd_list)) { - if (!buffer_pinned(bh)) - list_del_init(&bd->bd_list); - else - bd = NULL; - } - if (bd) - bd->bd_bh = NULL; + if (!list_empty(&bd->bd_list)) + list_del_init(&bd->bd_list); + bd->bd_bh = NULL; bh->b_private = NULL; - } - gfs2_log_unlock(sdp); - if (bd) kmem_cache_free(gfs2_bufdata_cachep, bd); + } bh = bh->b_this_page; } while (bh != head); + gfs2_log_unlock(sdp); return try_to_free_buffers(page); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 62a65fc448d..e6ee5b6e8d9 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -707,7 +707,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi * @top: The first pointer in the buffer * @bottom: One more than the last pointer * @height: the height this buffer is at - * @data: a pointer to a struct strip_mine + * @sm: a pointer to a struct strip_mine * * Returns: errno */ @@ -992,6 +992,8 @@ unlock: return err; } +#define GFS2_JTRUNC_REVOKES 8192 + /** * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files * @inode: The inode being truncated @@ -1003,8 +1005,6 @@ unlock: * if the number of pages being truncated gets too large. */ -#define GFS2_JTRUNC_REVOKES 8192 - static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize) { struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1216,6 +1216,7 @@ static int do_grow(struct inode *inode, u64 size) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_alloc_parms ap = { .target = 1, }; struct buffer_head *dibh; int error; int unstuff = 0; @@ -1226,7 +1227,7 @@ static int do_grow(struct inode *inode, u64 size) if (error) return error; - error = gfs2_inplace_reserve(ip, 1, 0); + error = gfs2_inplace_reserve(ip, &ap); if (error) goto do_grow_qunlock; unstuff = 1; @@ -1279,6 +1280,7 @@ do_grow_qunlock: int gfs2_setattr_size(struct inode *inode, u64 newsize) { + struct gfs2_inode *ip = GFS2_I(inode); int ret; u64 oldsize; @@ -1294,7 +1296,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) inode_dio_wait(inode); - ret = gfs2_rs_alloc(GFS2_I(inode)); + ret = gfs2_rs_alloc(ip); if (ret) goto out; @@ -1304,6 +1306,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) goto out; } + gfs2_rs_deltree(ip->i_res); ret = do_shrink(inode, oldsize, newsize); out: put_write_access(inode); @@ -1325,6 +1328,121 @@ int gfs2_file_dealloc(struct gfs2_inode *ip) } /** + * gfs2_free_journal_extents - Free cached journal bmap info + * @jd: The journal + * + */ + +void gfs2_free_journal_extents(struct gfs2_jdesc *jd) +{ + struct gfs2_journal_extent *jext; + + while(!list_empty(&jd->extent_list)) { + jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list); + list_del(&jext->list); + kfree(jext); + } +} + +/** + * gfs2_add_jextent - Add or merge a new extent to extent cache + * @jd: The journal descriptor + * @lblock: The logical block at start of new extent + * @dblock: The physical block at start of new extent + * @blocks: Size of extent in fs blocks + * + * Returns: 0 on success or -ENOMEM + */ + +static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks) +{ + struct gfs2_journal_extent *jext; + + if (!list_empty(&jd->extent_list)) { + jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list); + if ((jext->dblock + jext->blocks) == dblock) { + jext->blocks += blocks; + return 0; + } + } + + jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS); + if (jext == NULL) + return -ENOMEM; + jext->dblock = dblock; + jext->lblock = lblock; + jext->blocks = blocks; + list_add_tail(&jext->list, &jd->extent_list); + jd->nr_extents++; + return 0; +} + +/** + * gfs2_map_journal_extents - Cache journal bmap info + * @sdp: The super block + * @jd: The journal to map + * + * Create a reusable "extent" mapping from all logical + * blocks to all physical blocks for the given journal. This will save + * us time when writing journal blocks. Most journals will have only one + * extent that maps all their logical blocks. That's because gfs2.mkfs + * arranges the journal blocks sequentially to maximize performance. + * So the extent would map the first block for the entire file length. + * However, gfs2_jadd can happen while file activity is happening, so + * those journals may not be sequential. Less likely is the case where + * the users created their own journals by mounting the metafs and + * laying it out. But it's still possible. These journals might have + * several extents. + * + * Returns: 0 on success, or error on failure + */ + +int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) +{ + u64 lblock = 0; + u64 lblock_stop; + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct buffer_head bh; + unsigned int shift = sdp->sd_sb.sb_bsize_shift; + u64 size; + int rc; + + lblock_stop = i_size_read(jd->jd_inode) >> shift; + size = (lblock_stop - lblock) << shift; + jd->nr_extents = 0; + WARN_ON(!list_empty(&jd->extent_list)); + + do { + bh.b_state = 0; + bh.b_blocknr = 0; + bh.b_size = size; + rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0); + if (rc || !buffer_mapped(&bh)) + goto fail; + rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift); + if (rc) + goto fail; + size -= bh.b_size; + lblock += (bh.b_size >> ip->i_inode.i_blkbits); + } while(size > 0); + + fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid, + jd->nr_extents); + return 0; + +fail: + fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n", + rc, jd->jd_jid, + (unsigned long long)(i_size_read(jd->jd_inode) - size), + jd->nr_extents); + fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n", + rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr, + bh.b_state, (unsigned long long)bh.b_size); + gfs2_free_journal_extents(jd); + return rc; +} + +/** * gfs2_write_alloc_required - figure out if a write will require an allocation * @ip: the file being written to * @offset: the offset to write to diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 42fea03e2bd..81ded5e2aaa 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -55,5 +55,7 @@ extern int gfs2_truncatei_resume(struct gfs2_inode *ip); extern int gfs2_file_dealloc(struct gfs2_inode *ip); extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, unsigned int len); +extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); +extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 2e5fc268d32..1a349f9a968 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -53,6 +53,8 @@ * but never before the maximum hash table size has been reached. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/buffer_head.h> @@ -507,8 +509,8 @@ static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset, goto error; return 0; error: - printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg, - first ? "first in block" : "not first in block"); + pr_warn("%s: %s (%s)\n", + __func__, msg, first ? "first in block" : "not first in block"); return -EIO; } @@ -531,8 +533,7 @@ static int gfs2_dirent_offset(const void *buf) } return offset; wrong_type: - printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n", - be32_to_cpu(h->mh_type)); + pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type)); return -1; } @@ -728,7 +729,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { - /* printk(KERN_INFO "block num=%llu\n", leaf_no); */ + /* pr_info("block num=%llu\n", leaf_no); */ error = -EIO; } @@ -834,6 +835,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct gfs2_leaf *leaf; struct gfs2_dirent *dent; struct qstr name = { .name = "" }; + struct timespec tv = CURRENT_TIME; error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) @@ -850,7 +852,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, leaf->lf_entries = 0; leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); leaf->lf_next = 0; - memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); + leaf->lf_inode = cpu_to_be64(ip->i_no_addr); + leaf->lf_dist = cpu_to_be32(1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2)); dent = (struct gfs2_dirent *)(leaf+1); gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); *pbh = bh; @@ -1001,7 +1007,8 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); half_len = len >> 1; if (!half_len) { - printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); + pr_warn("i_depth %u lf_depth %u index %u\n", + dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); gfs2_consist_inode(dip); error = -EIO; goto fail_brelse; @@ -1612,11 +1619,31 @@ out: return ret; } +/** + * dir_new_leaf - Add a new leaf onto hash chain + * @inode: The directory + * @name: The name we are adding + * + * This adds a new dir leaf onto an existing leaf when there is not + * enough space to add a new dir entry. This is a last resort after + * we've expanded the hash table to max size and also split existing + * leaf blocks, so it will only occur for very large directories. + * + * The dist parameter is set to 1 for leaf blocks directly attached + * to the hash table, 2 for one layer of indirection, 3 for two layers + * etc. We are thus able to tell the difference between an old leaf + * with dist set to zero (i.e. "don't know") and a new one where we + * set this information for debug/fsck purposes. + * + * Returns: 0 on success, or -ve on error + */ + static int dir_new_leaf(struct inode *inode, const struct qstr *name) { struct buffer_head *bh, *obh; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_leaf *leaf, *oleaf; + u32 dist = 1; int error; u32 index; u64 bn; @@ -1626,6 +1653,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) if (error) return error; do { + dist++; oleaf = (struct gfs2_leaf *)obh->b_data; bn = be64_to_cpu(oleaf->lf_next); if (!bn) @@ -1643,6 +1671,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) brelse(obh); return -ENOSPC; } + leaf->lf_dist = cpu_to_be32(dist); oleaf->lf_next = cpu_to_be64(bh->b_blocknr); brelse(bh); brelse(obh); @@ -1657,41 +1686,64 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) return 0; } +static u16 gfs2_inode_ra_len(const struct gfs2_inode *ip) +{ + u64 where = ip->i_no_addr + 1; + if (ip->i_eattr == where) + return 1; + return 0; +} + /** * gfs2_dir_add - Add new filename into directory - * @dip: The GFS2 inode - * @filename: The new name - * @inode: The inode number of the entry - * @type: The type of the entry + * @inode: The directory inode + * @name: The new name + * @nip: The GFS2 inode to be linked in to the directory + * @da: The directory addition info + * + * If the call to gfs2_diradd_alloc_required resulted in there being + * no need to allocate any new directory blocks, then it will contain + * a pointer to the directory entry and the bh in which it resides. We + * can use that without having to repeat the search. If there was no + * free space, then we must now create more space. * * Returns: 0 on success, error code on failure */ int gfs2_dir_add(struct inode *inode, const struct qstr *name, - const struct gfs2_inode *nip) + const struct gfs2_inode *nip, struct gfs2_diradd *da) { struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; - struct gfs2_dirent *dent; + struct buffer_head *bh = da->bh; + struct gfs2_dirent *dent = da->dent; + struct timespec tv; struct gfs2_leaf *leaf; int error; while(1) { - dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, - &bh); + if (da->bh == NULL) { + dent = gfs2_dirent_search(inode, name, + gfs2_dirent_find_space, &bh); + } if (dent) { if (IS_ERR(dent)) return PTR_ERR(dent); dent = gfs2_init_dirent(inode, dent, name, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); + dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip)); + tv = CURRENT_TIME; if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } + da->dent = NULL; + da->bh = NULL; brelse(bh); ip->i_entries++; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv; if (S_ISDIR(nip->i_inode.i_mode)) inc_nlink(&ip->i_inode); mark_inode_dirty(inode); @@ -1742,6 +1794,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; + struct timespec tv = CURRENT_TIME; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1767,13 +1820,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) if (!entries) gfs2_consist_inode(dip); leaf->lf_entries = cpu_to_be16(--entries); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } brelse(bh); if (!dip->i_entries) gfs2_consist_inode(dip); dip->i_entries--; - dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; if (S_ISDIR(dentry->d_inode->i_mode)) drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); @@ -2017,22 +2072,36 @@ out: * gfs2_diradd_alloc_required - find if adding entry will require an allocation * @ip: the file being written to * @filname: the filename that's going to be added + * @da: The structure to return dir alloc info * - * Returns: 1 if alloc required, 0 if not, -ve on error + * Returns: 0 if ok, -ve on error */ -int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, + struct gfs2_diradd *da) { + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf); struct gfs2_dirent *dent; struct buffer_head *bh; + da->nr_blocks = 0; + da->bh = NULL; + da->dent = NULL; + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); if (!dent) { - return 1; + da->nr_blocks = sdp->sd_max_dirres; + if (!(ip->i_diskflags & GFS2_DIF_EXHASH) && + (GFS2_DIRENT_SIZE(name->len) < extra)) + da->nr_blocks = 1; + return 0; } if (IS_ERR(dent)) return PTR_ERR(dent); - brelse(bh); + da->bh = bh; + da->dent = dent; return 0; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 4f03bbd1873..126c65dda02 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -16,6 +16,14 @@ struct inode; struct gfs2_inode; struct gfs2_inum; +struct buffer_head; +struct gfs2_dirent; + +struct gfs2_diradd { + unsigned nr_blocks; + struct gfs2_dirent *dent; + struct buffer_head *bh; +}; extern struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename, @@ -23,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir, extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inode *ip); + const struct gfs2_inode *ip, struct gfs2_diradd *da); +static inline void gfs2_dir_no_add(struct gfs2_diradd *da) +{ + if (da->bh) + brelse(da->bh); + da->bh = NULL; +} extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, struct file_ra_state *f_ra); @@ -33,7 +47,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); extern int gfs2_diradd_alloc_required(struct inode *dir, - const struct qstr *filename); + const struct qstr *filename, + struct gfs2_diradd *da); extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head **bhp); extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 0621b46d474..26b3f952e6b 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -203,9 +203,9 @@ void gfs2_set_inode_flags(struct inode *inode) GFS2_DIF_INHERIT_JDATA) /** - * gfs2_set_flags - set flags on an inode - * @inode: The inode - * @flags: The flags to set + * do_gfs2_set_flags - set flags on an inode + * @filp: file pointer + * @reqflags: The flags to set * @mask: Indicates which flags are valid * */ @@ -256,7 +256,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) } if ((flags ^ new_flags) & GFS2_DIF_JDATA) { if (flags & GFS2_DIF_JDATA) - gfs2_log_flush(sdp, ip->i_gl); + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); error = filemap_fdatawrite(inode->i_mapping); if (error) goto out; @@ -318,7 +318,7 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) /** * gfs2_size_hint - Give a hint to the size of a write request - * @file: The struct file + * @filep: The struct file * @offset: The file offset of the write * @size: The length of the write * @@ -371,7 +371,7 @@ static int gfs2_allocate_page_backing(struct page *page) /** * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable * @vma: The virtual memory area - * @page: The page which is about to become writable + * @vmf: The virtual memory fault containing the page to become writable * * When the page becomes writable, we need to ensure that we have * blocks allocated on disk to back that page. @@ -383,6 +383,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned long last_index; u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; @@ -430,7 +431,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_unlock; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); - ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); + ap.target = data_blocks + ind_blocks; + ret = gfs2_inplace_reserve(ip, &ap); if (ret) goto out_quota_unlock; @@ -492,6 +494,7 @@ out: static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -620,7 +623,7 @@ static int gfs2_release(struct inode *inode, struct file *file) if (!(file->f_mode & FMODE_WRITE)) return 0; - gfs2_rs_delete(ip); + gfs2_rs_delete(ip, &inode->i_writecount); return 0; } @@ -681,7 +684,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, } /** - * gfs2_file_aio_write - Perform a write to a file + * gfs2_file_write_iter - Perform a write to a file * @iocb: The io context * @iov: The data to write * @nr_segs: Number of @iov segments @@ -694,11 +697,9 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, * */ -static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - size_t writesize = iov_length(iov, nr_segs); struct gfs2_inode *ip = GFS2_I(file_inode(file)); int ret; @@ -706,7 +707,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ret) return ret; - gfs2_size_hint(file, pos, writesize); + gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from)); if (file->f_flags & O_APPEND) { struct gfs2_holder gh; @@ -717,7 +718,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, gfs2_glock_dq_uninit(&gh); } - return generic_file_aio_write(iocb, iov, nr_segs, pos); + return generic_file_write_iter(iocb, from); } static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, @@ -800,6 +801,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, struct inode *inode = file_inode(file); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks = 0, ind_blocks = 0, rblocks; loff_t bytes, max_bytes; int error; @@ -808,6 +810,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; loff_t max_chunk_size = UINT_MAX & bsize_mask; + struct gfs2_holder gh; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; /* We only support the FALLOC_FL_KEEP_SIZE mode */ @@ -828,8 +832,10 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, if (error) return error; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); + mutex_lock(&inode->i_mutex); + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + error = gfs2_glock_nq(&gh); if (unlikely(error)) goto out_uninit; @@ -850,7 +856,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); + ap.target = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip, &ap); if (error) { if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { bytes >>= 1; @@ -896,9 +903,10 @@ out_trans_fail: out_qunlock: gfs2_quota_unlock(ip); out_unlock: - gfs2_glock_dq(&ip->i_gh); + gfs2_glock_dq(&gh); out_uninit: - gfs2_holder_uninit(&ip->i_gh); + gfs2_holder_uninit(&gh); + mutex_unlock(&inode->i_mutex); return error; } @@ -973,7 +981,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int error = 0; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT; mutex_lock(&fp->f_fl_mutex); @@ -983,7 +991,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) goto out; flock_lock_file_wait(file, &(struct file_lock){.fl_type = F_UNLCK}); - gfs2_glock_dq_wait(fl_gh); + gfs2_glock_dq(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); } else { error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, @@ -1048,10 +1056,10 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = gfs2_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = gfs2_file_write_iter, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, @@ -1060,7 +1068,7 @@ const struct file_operations gfs2_file_fops = { .lock = gfs2_lock, .flock = gfs2_flock, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .setlease = gfs2_setlease, .fallocate = gfs2_fallocate, }; @@ -1080,17 +1088,17 @@ const struct file_operations gfs2_dir_fops = { const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = gfs2_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = gfs2_file_write_iter, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, }; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c2f41b4d00b..ee4e04fe60f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -31,6 +33,7 @@ #include <linux/bit_spinlock.h> #include <linux/percpu.h> #include <linux/list_sort.h> +#include <linux/lockref.h> #include "gfs2.h" #include "incore.h" @@ -129,10 +132,10 @@ void gfs2_glock_free(struct gfs2_glock *gl) * */ -void gfs2_glock_hold(struct gfs2_glock *gl) +static void gfs2_glock_hold(struct gfs2_glock *gl) { - GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); - atomic_inc(&gl->gl_ref); + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); + lockref_get(&gl->gl_lockref); } /** @@ -187,20 +190,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) } /** - * gfs2_glock_put_nolock() - Decrement reference count on glock - * @gl: The glock to put - * - * This function should only be used if the caller has its own reference - * to the glock, in addition to the one it is dropping. - */ - -void gfs2_glock_put_nolock(struct gfs2_glock *gl) -{ - if (atomic_dec_and_test(&gl->gl_ref)) - GLOCK_BUG_ON(gl, 1); -} - -/** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put * @@ -211,17 +200,22 @@ void gfs2_glock_put(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); - if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { - __gfs2_glock_remove_from_lru(gl); - spin_unlock(&lru_lock); - spin_lock_bucket(gl->gl_hash); - hlist_bl_del_rcu(&gl->gl_list); - spin_unlock_bucket(gl->gl_hash); - GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); - GLOCK_BUG_ON(gl, mapping && mapping->nrpages); - trace_gfs2_glock_put(gl); - sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); - } + if (lockref_put_or_lock(&gl->gl_lockref)) + return; + + lockref_mark_dead(&gl->gl_lockref); + + spin_lock(&lru_lock); + __gfs2_glock_remove_from_lru(gl); + spin_unlock(&lru_lock); + spin_unlock(&gl->gl_lockref.lock); + spin_lock_bucket(gl->gl_hash); + hlist_bl_del_rcu(&gl->gl_list); + spin_unlock_bucket(gl->gl_hash); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); + GLOCK_BUG_ON(gl, mapping && mapping->nrpages); + trace_gfs2_glock_put(gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } /** @@ -244,7 +238,7 @@ static struct gfs2_glock *search_bucket(unsigned int hash, continue; if (gl->gl_sbd != sdp) continue; - if (atomic_inc_not_zero(&gl->gl_ref)) + if (lockref_get_not_dead(&gl->gl_lockref)) return gl; } @@ -283,7 +277,7 @@ static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holde static void gfs2_holder_wake(struct gfs2_holder *gh) { clear_bit(HIF_WAIT, &gh->gh_iflags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&gh->gh_iflags, HIF_WAIT); } @@ -396,10 +390,11 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) held2 = (new_state != LM_ST_UNLOCKED); if (held1 != held2) { + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); if (held2) - gfs2_glock_hold(gl); + gl->gl_lockref.count++; else - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; } if (held1 && held2 && list_empty(&gl->gl_holders)) clear_bit(GLF_QUEUED, &gl->gl_flags); @@ -416,7 +411,7 @@ static void gfs2_demote_wake(struct gfs2_glock *gl) { gl->gl_demote_state = LM_ST_EXCLUSIVE; clear_bit(GLF_DEMOTE, &gl->gl_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&gl->gl_flags, GLF_DEMOTE); } @@ -475,7 +470,7 @@ retry: do_xmote(gl, gh, LM_ST_UNLOCKED); break; default: /* Everything else */ - printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state); + pr_err("wanted %u got %u\n", gl->gl_target, state); GLOCK_BUG_ON(gl, 1); } spin_unlock(&gl->gl_spin); @@ -549,7 +544,7 @@ __acquires(&gl->gl_spin) /* lock_dlm */ ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); if (ret) { - printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret); + pr_err("lm_lock ret %d\n", ret); GLOCK_BUG_ON(gl, 1); } } else { /* lock_nolock */ @@ -625,15 +620,15 @@ out: out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_clear_bit(); - gfs2_glock_hold(gl); + smp_mb__after_atomic(); + gl->gl_lockref.count++; if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; return; out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return; } @@ -736,14 +731,14 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, cachep = gfs2_glock_aspace_cachep; else cachep = gfs2_glock_cachep; - gl = kmem_cache_alloc(cachep, GFP_KERNEL); + gl = kmem_cache_alloc(cachep, GFP_NOFS); if (!gl) return -ENOMEM; memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); if (glops->go_flags & GLOF_LVB) { - gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL); + gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS); if (!gl->gl_lksb.sb_lvbptr) { kmem_cache_free(cachep, gl); return -ENOMEM; @@ -754,7 +749,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_sbd = sdp; gl->gl_flags = 0; gl->gl_name = name; - atomic_set(&gl->gl_ref, 1); + gl->gl_lockref.count = 1; gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; @@ -942,7 +937,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - printk(KERN_ERR " %pV", &vaf); + pr_err("%pV", &vaf); } va_end(args); @@ -1017,13 +1012,13 @@ do_cancel: return; trap_recursive: - printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip); - printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); - printk(KERN_ERR "lock type: %d req lock state : %d\n", + pr_err("original: %pSR\n", (void *)gh2->gh_ip); + pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid)); + pr_err("lock type: %d req lock state : %d\n", gh2->gh_gl->gl_name.ln_type, gh2->gh_state); - printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip); - printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); - printk(KERN_ERR "lock type: %d req lock state : %d\n", + pr_err("new: %pSR\n", (void *)gh->gh_ip); + pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid)); + pr_err("lock type: %d req lock state : %d\n", gh->gh_gl->gl_name.ln_type, gh->gh_state); gfs2_dump_glock(NULL, gl); BUG(); @@ -1052,9 +1047,13 @@ int gfs2_glock_nq(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); add_to_queue(gh); - if ((LM_FLAG_NOEXP & gh->gh_flags) && - test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) && + test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) { set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gl->gl_lockref.count++; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gl->gl_lockref.count--; + } run_queue(gl, 1); spin_unlock(&gl->gl_spin); @@ -1356,10 +1355,10 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } } - spin_unlock(&gl->gl_spin); + gl->gl_lockref.count++; set_bit(GLF_REPLY_PENDING, &gl->gl_flags); - smp_wmb(); - gfs2_glock_hold(gl); + spin_unlock(&gl->gl_spin); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); } @@ -1404,17 +1403,25 @@ __acquires(&lru_lock) while(!list_empty(list)) { gl = list_entry(list->next, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); + if (!spin_trylock(&gl->gl_spin)) { +add_back_to_lru: + list_add(&gl->gl_lru, &lru_list); + atomic_inc(&lru_count); + continue; + } + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + spin_unlock(&gl->gl_spin); + goto add_back_to_lru; + } clear_bit(GLF_LRU, &gl->gl_flags); - gfs2_glock_hold(gl); - spin_unlock(&lru_lock); - spin_lock(&gl->gl_spin); + gl->gl_lockref.count++; if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; spin_unlock(&gl->gl_spin); - spin_lock(&lru_lock); + cond_resched_lock(&lru_lock); } } @@ -1439,7 +1446,7 @@ static long gfs2_scan_glock_lru(int nr) gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); /* Test for being demotable */ - if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + if (!test_bit(GLF_LOCK, &gl->gl_flags)) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); freed++; @@ -1493,7 +1500,7 @@ static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp, rcu_read_lock(); hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) { - if ((gl->gl_sbd == sdp) && atomic_inc_not_zero(&gl->gl_ref)) + if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); } rcu_read_unlock(); @@ -1555,13 +1562,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp) glock_hash_walk(thaw_glock, sdp); } -static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl) { - int ret; spin_lock(&gl->gl_spin); - ret = gfs2_dump_glock(seq, gl); + gfs2_dump_glock(seq, gl); spin_unlock(&gl->gl_spin); - return ret; } static void dump_glock_func(struct gfs2_glock *gl) @@ -1650,14 +1655,14 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) * @seq: the seq_file struct * @gh: the glock holder * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) +static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { struct task_struct *gh_owner = NULL; char flags_buf[32]; + rcu_read_lock(); if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", @@ -1667,7 +1672,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, gh_owner ? gh_owner->comm : "(ended)", (void *)gh->gh_ip); - return 0; + rcu_read_unlock(); } static const char *gflags2str(char *buf, const struct gfs2_glock *gl) @@ -1722,16 +1727,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) * example. The field's are n = number (id of the object), f = flags, * t = type, s = state, r = refcount, e = error, p = pid. * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned long long dtime; const struct gfs2_holder *gh; char gflags_buf[32]; - int error = 0; dtime = jiffies - gl->gl_demote_time; dtime *= 1000000/HZ; /* demote time in uSec */ @@ -1746,17 +1749,13 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) state2str(gl->gl_demote_state), dtime, atomic_read(&gl->gl_ail_count), atomic_read(&gl->gl_revokes), - atomic_read(&gl->gl_ref), gl->gl_hold_time); + (int)gl->gl_lockref.count, gl->gl_hold_time); + + list_for_each_entry(gh, &gl->gl_holders, gh_list) + dump_holder(seq, gh); - list_for_each_entry(gh, &gl->gl_holders, gh_list) { - error = dump_holder(seq, gh); - if (error) - goto out; - } if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) - error = glops->go_dump(seq, gl); -out: - return error; + glops->go_dump(seq, gl); } static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) @@ -1902,7 +1901,8 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) gi->nhash = 0; } /* Skip entries for other sb and dead entries */ - } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0); + } while (gi->sdp != gi->gl->gl_sbd || + __lockref_is_dead(&gi->gl->gl_lockref)); return 0; } @@ -1953,7 +1953,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) { - return dump_glock(seq, iter_ptr); + dump_glock(seq, iter_ptr); + return 0; } static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 69f66e3d22b..32572f71f02 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -181,8 +181,6 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); -extern void gfs2_glock_hold(struct gfs2_glock *gl); -extern void gfs2_glock_put_nolock(struct gfs2_glock *gl); extern void gfs2_glock_put(struct gfs2_glock *gl); extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, struct gfs2_holder *gh); @@ -201,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) extern __printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index e2e0a90396e..2ffc67dce87 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -82,23 +82,30 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) struct gfs2_trans tr; memset(&tr, 0, sizeof(tr)); + INIT_LIST_HEAD(&tr.tr_buf); + INIT_LIST_HEAD(&tr.tr_databuf); tr.tr_revokes = atomic_read(&gl->gl_ail_count); if (!tr.tr_revokes) return; - /* A shortened, inline version of gfs2_trans_begin() */ + /* A shortened, inline version of gfs2_trans_begin() + * tr->alloced is not set since the transaction structure is + * on the stack */ tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); tr.tr_ip = (unsigned long)__builtin_return_address(0); sb_start_intwrite(sdp->sd_vfs); - gfs2_log_reserve(sdp, tr.tr_reserved); + if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) { + sb_end_intwrite(sdp->sd_vfs); + return; + } WARN_ON_ONCE(current->journal_info); current->journal_info = &tr; __gfs2_ail_flush(gl, 0, tr.tr_revokes); gfs2_trans_end(sdp); - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) @@ -119,7 +126,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) return; __gfs2_ail_flush(gl, fsync, max_revokes); gfs2_trans_end(sdp); - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } /** @@ -133,7 +140,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) static void rgrp_go_sync(struct gfs2_glock *gl) { - struct address_space *metamapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd; int error; @@ -141,10 +149,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl) return; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - gfs2_log_flush(gl->gl_sbd, gl); - filemap_fdatawrite(metamapping); - error = filemap_fdatawait(metamapping); - mapping_set_error(metamapping, error); + gfs2_log_flush(sdp, gl, NORMAL_FLUSH); + filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + mapping_set_error(mapping, error); gfs2_ail_empty_gl(gl); spin_lock(&gl->gl_spin); @@ -166,11 +174,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl) static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { - struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; WARN_ON_ONCE(!(flags & DIO_METADATA)); - gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); - truncate_inode_pages(mapping, 0); + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); if (gl->gl_object) { struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; @@ -192,14 +201,17 @@ static void inode_go_sync(struct gfs2_glock *gl) if (ip && !S_ISREG(ip->i_inode.i_mode)) ip = NULL; - if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) - unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + if (ip) { + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + inode_dio_wait(&ip->i_inode); + } if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - gfs2_log_flush(gl->gl_sbd, gl); + gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH); filemap_fdatawrite(metamapping); if (ip) { struct address_space *mapping = ip->i_inode.i_mapping; @@ -214,7 +226,7 @@ static void inode_go_sync(struct gfs2_glock *gl) * Writeback of the data mapping may cause the dirty flag to be set * so we have to clear it again here. */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(GLF_DIRTY, &gl->gl_flags); } @@ -222,8 +234,8 @@ static void inode_go_sync(struct gfs2_glock *gl) * inode_go_inval - prepare a inode glock to be released * @gl: the glock * @flags: - * - * Normally we invlidate everything, but if we are moving into + * + * Normally we invalidate everything, but if we are moving into * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we * can keep hold of the metadata, since it won't have changed. * @@ -246,7 +258,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) } if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) { - gfs2_log_flush(gl->gl_sbd, NULL); + gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH); gl->gl_sbd->sd_rindex_uptodate = 0; } if (ip && S_ISREG(ip->i_inode.i_mode)) @@ -410,6 +422,9 @@ static int inode_go_lock(struct gfs2_holder *gh) return error; } + if (gh->gh_state != LM_ST_DEFERRED) + inode_dio_wait(&ip->i_inode); + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && (gh->gh_state == LM_ST_EXCLUSIVE)) { @@ -429,49 +444,55 @@ static int inode_go_lock(struct gfs2_holder *gh) * @seq: The iterator * @ip: the inode * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_inode *ip = gl->gl_object; if (ip == NULL) - return 0; + return; gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, IF2DT(ip->i_inode.i_mode), ip->i_flags, (unsigned int)ip->i_diskflags, (unsigned long long)i_size_read(&ip->i_inode)); - return 0; } /** - * trans_go_sync - promote/demote the transaction glock + * freeze_go_sync - promote/demote the freeze glock * @gl: the glock * @state: the requested state * @flags: * */ -static void trans_go_sync(struct gfs2_glock *gl) +static void freeze_go_sync(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; + DEFINE_WAIT(wait); - if (gl->gl_state != LM_ST_UNLOCKED && + if (gl->gl_state == LM_ST_SHARED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); + atomic_set(&sdp->sd_log_freeze, 1); + wake_up(&sdp->sd_logd_waitq); + do { + prepare_to_wait(&sdp->sd_log_frozen_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&sdp->sd_log_freeze)) + io_schedule(); + } while(atomic_read(&sdp->sd_log_freeze)); + finish_wait(&sdp->sd_log_frozen_wait, &wait); } } /** - * trans_go_xmote_bh - After promoting/demoting the transaction glock + * freeze_go_xmote_bh - After promoting/demoting the freeze glock * @gl: the glock * */ -static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) +static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) { struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); @@ -504,7 +525,7 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) * Always returns 0 */ -static int trans_go_demote_ok(const struct gfs2_glock *gl) +static int freeze_go_demote_ok(const struct gfs2_glock *gl) { return 0; } @@ -525,9 +546,9 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) if (gl->gl_demote_state == LM_ST_UNLOCKED && gl->gl_state == LM_ST_SHARED && ip) { - gfs2_glock_hold(gl); + gl->gl_lockref.count++; if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; } } @@ -552,13 +573,13 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, - .go_flags = GLOF_ASPACE | GLOF_LVB, + .go_flags = GLOF_LVB, }; -const struct gfs2_glock_operations gfs2_trans_glops = { - .go_sync = trans_go_sync, - .go_xmote_bh = trans_go_xmote_bh, - .go_demote_ok = trans_go_demote_ok, +const struct gfs2_glock_operations gfs2_freeze_glops = { + .go_sync = freeze_go_sync, + .go_xmote_bh = freeze_go_xmote_bh, + .go_demote_ok = freeze_go_demote_ok, .go_type = LM_TYPE_NONDISK, }; diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index bf95a2dc166..7455d2629bc 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -15,7 +15,7 @@ extern const struct gfs2_glock_operations gfs2_meta_glops; extern const struct gfs2_glock_operations gfs2_inode_glops; extern const struct gfs2_glock_operations gfs2_rgrp_glops; -extern const struct gfs2_glock_operations gfs2_trans_glops; +extern const struct gfs2_glock_operations gfs2_freeze_glops; extern const struct gfs2_glock_operations gfs2_iopen_glops; extern const struct gfs2_glock_operations gfs2_flock_glops; extern const struct gfs2_glock_operations gfs2_nondisk_glops; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 26aabd7caba..67d310c9ada 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -21,6 +21,7 @@ #include <linux/rbtree.h> #include <linux/ktime.h> #include <linux/percpu.h> +#include <linux/lockref.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -51,7 +52,7 @@ struct gfs2_log_header_host { */ struct gfs2_log_operations { - void (*lo_before_commit) (struct gfs2_sbd *sdp); + void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); void (*lo_before_scan) (struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass); @@ -71,6 +72,7 @@ struct gfs2_bitmap { u32 bi_offset; u32 bi_start; u32 bi_len; + u32 bi_blocks; }; struct gfs2_rgrpd { @@ -91,6 +93,7 @@ struct gfs2_rgrpd { struct gfs2_rgrp_lvb *rd_rgl; u32 rd_last_alloc; u32 rd_flags; + u32 rd_extfail_pt; /* extent failure point */ #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ @@ -101,19 +104,25 @@ struct gfs2_rgrpd { struct gfs2_rbm { struct gfs2_rgrpd *rgd; - struct gfs2_bitmap *bi; /* Bitmap must belong to the rgd */ u32 offset; /* The offset is bitmap relative */ + int bii; /* Bitmap index */ }; +static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm) +{ + return rbm->rgd->rd_bits + rbm->bii; +} + static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) { - return rbm->rgd->rd_data0 + (rbm->bi->bi_start * GFS2_NBBY) + rbm->offset; + return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) + + rbm->offset; } static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1, const struct gfs2_rbm *rbm2) { - return (rbm1->rgd == rbm2->rgd) && (rbm1->bi == rbm2->bi) && + return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) && (rbm1->offset == rbm2->offset); } @@ -209,7 +218,7 @@ struct gfs2_glock_operations { int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); - int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); void (*go_callback)(struct gfs2_glock *gl, bool remote); const int go_type; const unsigned long go_flags; @@ -278,6 +287,20 @@ struct gfs2_blkreserv { unsigned int rs_qa_qd_num; }; +/* + * Allocation parameters + * @target: The number of blocks we'd ideally like to allocate + * @aflags: The flags (e.g. Orlov flag) + * + * The intent is to gradually expand this structure over time in + * order to give more information, e.g. alignment, min extent size + * to the allocation code. + */ +struct gfs2_alloc_parms { + u32 target; + u32 aflags; +}; + enum { GLF_LOCK = 1, GLF_DEMOTE = 3, @@ -300,9 +323,9 @@ struct gfs2_glock { struct gfs2_sbd *gl_sbd; unsigned long gl_flags; /* GLF_... */ struct lm_lockname gl_name; - atomic_t gl_ref; - spinlock_t gl_spin; + struct lockref gl_lockref; +#define gl_spin gl_lockref.lock /* State fields protected by gl_spin */ unsigned int gl_state:2, /* Current state */ @@ -328,7 +351,15 @@ struct gfs2_glock { atomic_t gl_ail_count; atomic_t gl_revokes; struct delayed_work gl_work; - struct work_struct gl_delete; + union { + /* For inode and iopen glocks only */ + struct work_struct gl_delete; + /* For rgrp glocks only */ + struct { + loff_t start; + loff_t end; + } gl_vm; + }; struct rcu_head gl_rcu; }; @@ -340,6 +371,7 @@ enum { GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, GIF_ORDERED = 4, + GIF_FREE_VFS_INODE = 5, }; struct gfs2_inode { @@ -397,12 +429,14 @@ enum { }; struct gfs2_quota_data { + struct hlist_bl_node qd_hlist; struct list_head qd_list; - struct list_head qd_reclaim; - - atomic_t qd_count; - struct kqid qd_id; + struct gfs2_sbd *qd_sbd; + struct lockref qd_lockref; + struct list_head qd_lru; + unsigned qd_hash; + unsigned long qd_flags; /* QDF_... */ s64 qd_change; @@ -420,6 +454,7 @@ struct gfs2_quota_data { u64 qd_sync_gen; unsigned long qd_last_warn; + struct rcu_head qd_rcu; }; struct gfs2_trans { @@ -428,11 +463,9 @@ struct gfs2_trans { unsigned int tr_blocks; unsigned int tr_revokes; unsigned int tr_reserved; - - struct gfs2_holder tr_t_gh; - - int tr_touched; - int tr_attached; + unsigned int tr_touched:1; + unsigned int tr_attached:1; + unsigned int tr_alloced:1; unsigned int tr_num_buf_new; unsigned int tr_num_databuf_new; @@ -442,6 +475,8 @@ struct gfs2_trans { unsigned int tr_num_revoke_rm; struct list_head tr_list; + struct list_head tr_databuf; + struct list_head tr_buf; unsigned int tr_first; struct list_head tr_ail1_list; @@ -449,7 +484,7 @@ struct gfs2_trans { }; struct gfs2_journal_extent { - struct list_head extent_list; + struct list_head list; unsigned int lblock; /* First logical block */ u64 dblock; /* First disk block */ @@ -459,6 +494,7 @@ struct gfs2_journal_extent { struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; + unsigned int nr_extents; struct work_struct jd_work; struct inode *jd_inode; unsigned long jd_flags; @@ -466,6 +502,15 @@ struct gfs2_jdesc { unsigned int jd_jid; unsigned int jd_blocks; int jd_recover_error; + /* Replay stuff */ + + unsigned int jd_found_blocks; + unsigned int jd_found_revokes; + unsigned int jd_replayed_blocks; + + struct list_head jd_revoke_list; + unsigned int jd_replay_tail; + }; struct gfs2_statfs_change_host { @@ -516,7 +561,6 @@ struct gfs2_tune { unsigned int gt_logd_secs; - unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ unsigned int gt_quota_scale_num; /* Numerator */ unsigned int gt_quota_scale_den; /* Denominator */ @@ -636,7 +680,7 @@ struct gfs2_sbd { struct lm_lockstruct sd_lockstruct; struct gfs2_holder sd_live_gh; struct gfs2_glock *sd_rename_gl; - struct gfs2_glock *sd_trans_gl; + struct gfs2_glock *sd_freeze_gl; wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; @@ -684,6 +728,8 @@ struct gfs2_sbd { struct gfs2_holder sd_sc_gh; struct gfs2_holder sd_qc_gh; + struct completion sd_journal_ready; + /* Daemon stuff */ struct task_struct *sd_logd_process; @@ -694,35 +740,31 @@ struct gfs2_sbd { struct list_head sd_quota_list; atomic_t sd_quota_count; struct mutex sd_quota_mutex; + struct mutex sd_quota_sync_mutex; wait_queue_head_t sd_quota_wait; struct list_head sd_trunc_list; spinlock_t sd_trunc_lock; unsigned int sd_quota_slots; - unsigned int sd_quota_chunks; - unsigned char **sd_quota_bitmap; + unsigned long *sd_quota_bitmap; + spinlock_t sd_bitmap_lock; u64 sd_quota_sync_gen; /* Log stuff */ + struct address_space sd_aspace; + spinlock_t sd_log_lock; struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; - unsigned int sd_log_commited_buf; - unsigned int sd_log_commited_databuf; int sd_log_commited_revoke; atomic_t sd_log_pinned; - unsigned int sd_log_num_buf; unsigned int sd_log_num_revoke; - unsigned int sd_log_num_rg; - unsigned int sd_log_num_databuf; - struct list_head sd_log_le_buf; struct list_head sd_log_le_revoke; - struct list_head sd_log_le_databuf; struct list_head sd_log_le_ordered; spinlock_t sd_ordered_lock; @@ -750,17 +792,14 @@ struct gfs2_sbd { struct list_head sd_ail1_list; struct list_head sd_ail2_list; - /* Replay stuff */ - - struct list_head sd_revoke_list; - unsigned int sd_replay_tail; - - unsigned int sd_found_blocks; - unsigned int sd_found_revokes; - unsigned int sd_replayed_blocks; - /* For quiescing the filesystem */ struct gfs2_holder sd_freeze_gh; + struct gfs2_holder sd_freeze_root_gh; + struct gfs2_holder sd_thaw_gh; + atomic_t sd_log_freeze; + atomic_t sd_frozen_root; + wait_queue_head_t sd_frozen_root_wait; + wait_queue_head_t sd_log_frozen_wait; char sd_fsname[GFS2_FSNAME_LEN]; char sd_table_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index d2384f7c53e..e62e5947788 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, ip = GFS2_I(inode); if (!inode) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -376,25 +376,25 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip, inode->i_gid = current_fsgid(); } -static int alloc_dinode(struct gfs2_inode *ip, u32 flags) +static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, }; int error; - int dblocks = 1; error = gfs2_quota_lock_check(ip); if (error) goto out; - error = gfs2_inplace_reserve(ip, RES_DINODE, flags); + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_quota; - error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, (*dblocks * RES_RG_BIT) + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipreserv; - error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation); + error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation); ip->i_no_formal_ino = ip->i_generation; ip->i_inode.i_ino = ip->i_no_addr; ip->i_goal = ip->i_no_addr; @@ -427,6 +427,33 @@ static void gfs2_init_dir(struct buffer_head *dibh, } /** + * gfs2_init_xattr - Initialise an xattr block for a new inode + * @ip: The inode in question + * + * This sets up an empty xattr block for a new inode, ready to + * take any ACLs, LSM xattrs, etc. + */ + +static void gfs2_init_xattr(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh; + struct gfs2_ea_header *ea; + + bh = gfs2_meta_new(ip->i_gl, ip->i_eattr); + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_metatype_set(bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + + ea = GFS2_EA_BH2FIRST(bh); + ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize); + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_flags = GFS2_EAFLAG_LAST; + + brelse(bh); +} + +/** * init_dinode - Fill in a new dinode structure * @dip: The directory this inode is being created in * @ip: The inode @@ -468,25 +495,45 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, brelse(dibh); } +/** + * gfs2_trans_da_blocks - Calculate number of blocks to link inode + * @dip: The directory we are linking into + * @da: The dir add information + * @nr_inodes: The number of inodes involved + * + * This calculate the number of blocks we need to reserve in a + * transaction to link @nr_inodes into a directory. In most cases + * @nr_inodes will be 2 (the directory plus the inode being linked in) + * but in case of rename, 4 may be required. + * + * Returns: Number of blocks + */ + +static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip, + const struct gfs2_diradd *da, + unsigned nr_inodes) +{ + return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) + + (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS; +} + static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip, int arq) + struct gfs2_inode *ip, struct gfs2_diradd *da) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc_parms ap = { .target = da->nr_blocks, }; int error; - if (arq) { + if (da->nr_blocks) { error = gfs2_quota_lock_check(dip); if (error) goto fail_quota_locks; - error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0); + error = gfs2_inplace_reserve(dip, &ap); if (error) goto fail_quota_locks; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - dip->i_rgd->rd_length + - 2 * RES_DINODE + - RES_STATFS + RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0); if (error) goto fail_ipreserv; } else { @@ -495,7 +542,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; } - error = gfs2_dir_add(&dip->i_inode, name, ip); + error = gfs2_dir_add(&dip->i_inode, name, ip, da); if (error) goto fail_end_trans; @@ -524,13 +571,6 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, return err; } -static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, - const struct qstr *qstr) -{ - return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, - &gfs2_initxattrs, NULL); -} - /** * gfs2_create_inode - Create a new inode * @dir: The parent directory @@ -550,15 +590,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, unsigned int size, int excl, int *opened) { const struct qstr *name = &dentry->d_name; + struct posix_acl *default_acl, *acl; struct gfs2_holder ghs[2]; struct inode *inode = NULL; struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; struct dentry *d; - int error; + int error, free_vfs_inode = 0; u32 aflags = 0; - int arq; + unsigned blocks = 1; + struct gfs2_diradd da = { .bh = NULL, }; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -583,24 +625,27 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = PTR_ERR(inode); if (!IS_ERR(inode)) { d = d_splice_alias(inode, dentry); + error = PTR_ERR(d); + if (IS_ERR(d)) + goto fail_gunlock; error = 0; - if (file && !IS_ERR(d)) { - if (d == NULL) - d = dentry; - if (S_ISREG(inode->i_mode)) - error = finish_open(file, d, gfs2_open_common, opened); - else + if (file) { + if (S_ISREG(inode->i_mode)) { + WARN_ON(d != NULL); + error = finish_open(file, dentry, gfs2_open_common, opened); + } else { error = finish_no_open(file, d); + } + } else { + dput(d); } gfs2_glock_dq_uninit(ghs); - if (IS_ERR(d)) - return PTR_ERR(d); return error; } else if (error != -ENOENT) { goto fail_gunlock; } - arq = error = gfs2_diradd_alloc_required(dir, name); + error = gfs2_diradd_alloc_required(dir, name, &da); if (error < 0) goto fail_gunlock; @@ -609,10 +654,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!inode) goto fail_gunlock; + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + goto fail_free_vfs_inode; + ip = GFS2_I(inode); error = gfs2_rs_alloc(ip); if (error) - goto fail_free_inode; + goto fail_free_acls; inode->i_mode = mode; set_nlink(inode, S_ISDIR(mode) ? 2 : 1); @@ -647,10 +696,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, (dip->i_diskflags & GFS2_DIF_TOPDIR)) aflags |= GFS2_AF_ORLOV; - error = alloc_dinode(ip, aflags); + if (default_acl || acl) + blocks++; + + error = alloc_dinode(ip, aflags, &blocks); if (error) goto fail_free_inode; + gfs2_set_inode_blocks(inode, blocks); + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (error) goto fail_free_inode; @@ -660,10 +714,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; - error = gfs2_trans_begin(sdp, RES_DINODE, 0); + error = gfs2_trans_begin(sdp, blocks, 0); if (error) goto fail_gunlock2; + if (blocks > 1) { + ip->i_eattr = ip->i_no_addr + 1; + gfs2_init_xattr(ip); + } init_dinode(dip, ip, symname); gfs2_trans_end(sdp); @@ -680,15 +738,25 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_set_iop(inode); insert_inode_hash(inode); - error = gfs2_acl_create(dip, inode); + if (default_acl) { + error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + if (error) goto fail_gunlock3; - error = gfs2_security_init(dip, ip, name); + error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, + &gfs2_initxattrs, NULL); if (error) goto fail_gunlock3; - error = link_dinode(dip, name, ip, arq); + error = link_dinode(dip, name, ip, &da); if (error) goto fail_gunlock3; @@ -713,15 +781,23 @@ fail_gunlock2: fail_free_inode: if (ip->i_gl) gfs2_glock_put(ip->i_gl); - gfs2_rs_delete(ip); - free_inode_nonrcu(inode); - inode = NULL; + gfs2_rs_delete(ip, NULL); +fail_free_acls: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); +fail_free_vfs_inode: + free_vfs_inode = 1; fail_gunlock: + gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); if (inode && !IS_ERR(inode)) { clear_nlink(inode); - mark_inode_dirty(inode); - set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); + if (!free_vfs_inode) + mark_inode_dirty(inode); + set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED, + &GFS2_I(inode)->i_flags); iput(inode); } fail: @@ -777,12 +853,19 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, } d = d_splice_alias(inode, dentry); + if (IS_ERR(d)) { + iput(inode); + gfs2_glock_dq_uninit(&gh); + return d; + } if (file && S_ISREG(inode->i_mode)) error = finish_open(file, dentry, gfs2_open_common, opened); gfs2_glock_dq_uninit(&gh); - if (error) + if (error) { + dput(d); return ERR_PTR(error); + } return d; } @@ -813,7 +896,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[2]; struct buffer_head *dibh; - int alloc_required; + struct gfs2_diradd da = { .bh = NULL, }; int error; if (S_ISDIR(inode->i_mode)) @@ -868,24 +951,21 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (ip->i_inode.i_nlink == (u32)-1) goto out_gunlock; - alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); + error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da); if (error < 0) goto out_gunlock; - error = 0; - if (alloc_required) { + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; error = gfs2_quota_lock_check(dip); if (error) goto out_gunlock; - error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0); + error = gfs2_inplace_reserve(dip, &ap); if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(dip, sdp->sd_max_dirres) + - 2 * RES_DINODE + RES_STATFS + - RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0); if (error) goto out_ipres; } else { @@ -898,7 +978,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_end_trans; - error = gfs2_dir_add(dir, &dentry->d_name, ip); + error = gfs2_dir_add(dir, &dentry->d_name, ip, &da); if (error) goto out_brelse; @@ -914,12 +994,13 @@ out_brelse: out_end_trans: gfs2_trans_end(sdp); out_ipres: - if (alloc_required) + if (da.nr_blocks) gfs2_inplace_release(dip); out_gunlock_q: - if (alloc_required) + if (da.nr_blocks) gfs2_quota_unlock(dip); out_gunlock: + gfs2_dir_no_add(&da); gfs2_glock_dq(ghs + 1); out_child: gfs2_glock_dq(ghs); @@ -1163,14 +1244,19 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, d = __gfs2_lookup(dir, dentry, file, opened); if (IS_ERR(d)) return PTR_ERR(d); - if (d == NULL) - d = dentry; - if (d->d_inode) { - if (!(*opened & FILE_OPENED)) - return finish_no_open(file, d); + if (d != NULL) + dentry = d; + if (dentry->d_inode) { + if (!(*opened & FILE_OPENED)) { + if (d == NULL) + dget(dentry); + return finish_no_open(file, dentry); + } + dput(d); return 0; } + BUG_ON(d != NULL); if (!(flags & O_CREAT)) return -ENOENT; @@ -1208,6 +1294,10 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) } tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); + if (!tmp) { + error = -ENOENT; + break; + } if (IS_ERR(tmp)) { error = PTR_ERR(tmp); break; @@ -1244,7 +1334,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; - int alloc_required = 0; + struct gfs2_diradd da = { .nr_blocks = 0, }; unsigned int x; int error; @@ -1378,25 +1468,24 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; } - if (nip == NULL) - alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); - error = alloc_required; - if (error < 0) - goto out_gunlock; + if (nip == NULL) { + error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da); + if (error) + goto out_gunlock; + } - if (alloc_required) { + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; error = gfs2_quota_lock_check(ndip); if (error) goto out_gunlock; - error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres, 0); + error = gfs2_inplace_reserve(ndip, &ap); if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(ndip, sdp->sd_max_dirres) + - 4 * RES_DINODE + 4 * RES_LEAF + - RES_STATFS + RES_QUOTA + 4, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) + + 4 * RES_LEAF + 4, 0); if (error) goto out_ipreserv; } else { @@ -1430,19 +1519,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; - error = gfs2_dir_add(ndir, &ndentry->d_name, ip); + error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da); if (error) goto out_end_trans; out_end_trans: gfs2_trans_end(sdp); out_ipreserv: - if (alloc_required) + if (da.nr_blocks) gfs2_inplace_release(ndip); out_gunlock_q: - if (alloc_required) + if (da.nr_blocks) gfs2_quota_unlock(ndip); out_gunlock: + gfs2_dir_no_add(&da); while (x--) { gfs2_glock_dq(ghs + x); gfs2_holder_uninit(ghs + x); @@ -1523,18 +1613,26 @@ int gfs2_permission(struct inode *inode, int mask) { struct gfs2_inode *ip; struct gfs2_holder i_gh; + struct gfs2_sbd *sdp = GFS2_SB(inode); int error; int unlock = 0; + int frozen_root = 0; ip = GFS2_I(inode); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return error; - unlock = 1; + if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) && + inode == sdp->sd_root_dir->d_inode && + atomic_inc_not_zero(&sdp->sd_frozen_root))) + frozen_root = 1; + else { + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + unlock = 1; + } } if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) @@ -1543,6 +1641,8 @@ int gfs2_permission(struct inode *inode, int mask) error = generic_permission(inode, mask); if (unlock) gfs2_glock_dq_uninit(&i_gh); + else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root)) + wake_up(&sdp->sd_frozen_root_wait); return error; } @@ -1596,10 +1696,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) ogid = ngid = NO_GID_QUOTA_CHANGE; - error = gfs2_quota_lock(ip, nuid, ngid); + error = get_write_access(inode); if (error) return error; + error = gfs2_rs_alloc(ip); + if (error) + goto out; + + error = gfs2_rindex_update(sdp); + if (error) + goto out; + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out; + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { error = gfs2_quota_check(ip, nuid, ngid); @@ -1626,6 +1738,8 @@ out_end_trans: gfs2_trans_end(sdp); out_gunlock_q: gfs2_quota_unlock(ip); +out: + put_write_access(inode); return error; } @@ -1667,10 +1781,11 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) error = gfs2_setattr_size(inode, attr->ia_size); else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) error = setattr_chown(inode, attr); - else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) - error = gfs2_acl_chmod(ip, attr); - else + else { error = gfs2_setattr_simple(inode, attr); + if (!error && attr->ia_valid & ATTR_MODE) + error = posix_acl_chmod(inode, inode->i_mode); + } out: if (!error) @@ -1700,19 +1815,29 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode = dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; + struct gfs2_sbd *sdp = GFS2_SB(inode); int error; int unlock = 0; + int frozen_root = 0; if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); - if (error) - return error; - unlock = 1; + if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) && + inode == sdp->sd_root_dir->d_inode && + atomic_inc_not_zero(&sdp->sd_frozen_root))) + frozen_root = 1; + else { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) + return error; + unlock = 1; + } } generic_fillattr(inode, stat); if (unlock) gfs2_glock_dq_uninit(&gh); + else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root)) + wake_up(&sdp->sd_frozen_root_wait); return 0; } @@ -1830,6 +1955,7 @@ const struct inode_operations gfs2_file_iops = { .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, }; const struct inode_operations gfs2_dir_iops = { @@ -1851,6 +1977,7 @@ const struct inode_operations gfs2_dir_iops = { .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, .atomic_open = gfs2_atomic_open, }; @@ -1866,6 +1993,5 @@ const struct inode_operations gfs2_symlink_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, }; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index c8423d6de6c..4fafea1c9ec 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/dlm.h> #include <linux/slab.h> @@ -176,7 +178,7 @@ static void gdlm_bast(void *arg, int mode) gfs2_glock_cb(gl, LM_ST_SHARED); break; default: - printk(KERN_ERR "unknown bast mode %d", mode); + pr_err("unknown bast mode %d\n", mode); BUG(); } } @@ -195,7 +197,7 @@ static int make_mode(const unsigned int lmstate) case LM_ST_SHARED: return DLM_LOCK_PR; } - printk(KERN_ERR "unknown LM state %d", lmstate); + pr_err("unknown LM state %d\n", lmstate); BUG(); return -1; } @@ -308,7 +310,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, NULL, gl); if (error) { - printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n", + pr_err("gdlm_unlock %x,%llx err=%d\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, error); return; @@ -466,19 +468,19 @@ static void gdlm_cancel(struct gfs2_glock *gl) static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen, char *lvb_bits) { - uint32_t gen; + __le32 gen; memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE); - memcpy(&gen, lvb_bits, sizeof(uint32_t)); + memcpy(&gen, lvb_bits, sizeof(__le32)); *lvb_gen = le32_to_cpu(gen); } static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, char *lvb_bits) { - uint32_t gen; + __le32 gen; memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE); gen = cpu_to_le32(lvb_gen); - memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t)); + memcpy(ls->ls_control_lvb, &gen, sizeof(__le32)); } static int all_jid_bits_clear(char *lvb) @@ -1034,8 +1036,8 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, new_size = old_size + RECOVER_SIZE_INC; - submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); - result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); + result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); if (!submit || !result) { kfree(submit); kfree(result); @@ -1102,7 +1104,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) } if (ls->ls_recover_submit[jid]) { - fs_info(sdp, "recover_slot jid %d gen %u prev %u", + fs_info(sdp, "recover_slot jid %d gen %u prev %u\n", jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); } ls->ls_recover_submit[jid] = ls->ls_recover_block; @@ -1132,7 +1134,7 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY); spin_unlock(&ls->ls_recover_spin); } @@ -1269,7 +1271,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); return 0; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 610613fb65b..3966fadbceb 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -18,6 +18,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/bio.h> +#include <linux/blkdev.h> #include <linux/writeback.h> #include <linux/list_sort.h> @@ -145,8 +146,10 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) { struct list_head *head = &sdp->sd_ail1_list; struct gfs2_trans *tr; + struct blk_plug plug; trace_gfs2_ail_flush(sdp, wbc, 1); + blk_start_plug(&plug); spin_lock(&sdp->sd_ail_lock); restart: list_for_each_entry_reverse(tr, head, tr_list) { @@ -156,6 +159,7 @@ restart: goto restart; } spin_unlock(&sdp->sd_ail_lock); + blk_finish_plug(&plug); trace_gfs2_ail_flush(sdp, wbc, 0); } @@ -297,6 +301,23 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) } /** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + + atomic_add(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, blks); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + up_read(&sdp->sd_log_flush_lock); +} + +/** * gfs2_log_reserve - Make a log reservation * @sdp: The GFS2 superblock * @blks: The number of blocks to reserve @@ -354,7 +375,10 @@ retry: wake_up(&sdp->sd_log_waitq); down_read(&sdp->sd_log_flush_lock); - + if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { + gfs2_log_release(sdp, blks); + return -EROFS; + } return 0; } @@ -410,24 +434,22 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer static unsigned int calc_reserved(struct gfs2_sbd *sdp) { unsigned int reserved = 0; - unsigned int mbuf_limit, metabufhdrs_needed; - unsigned int dbuf_limit, databufhdrs_needed; - unsigned int revokes = 0; + unsigned int mbuf; + unsigned int dbuf; + struct gfs2_trans *tr = sdp->sd_log_tr; - mbuf_limit = buf_limit(sdp); - metabufhdrs_needed = (sdp->sd_log_commited_buf + - (mbuf_limit - 1)) / mbuf_limit; - dbuf_limit = databuf_limit(sdp); - databufhdrs_needed = (sdp->sd_log_commited_databuf + - (dbuf_limit - 1)) / dbuf_limit; + if (tr) { + mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm; + dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + reserved = mbuf + dbuf; + /* Account for header blocks */ + reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp)); + reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp)); + } if (sdp->sd_log_commited_revoke > 0) - revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, + reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, sizeof(u64)); - - reserved = sdp->sd_log_commited_buf + metabufhdrs_needed + - sdp->sd_log_commited_databuf + databufhdrs_needed + - revokes; /* One for the overall header */ if (reserved) reserved++; @@ -551,10 +573,10 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct buffer_head *bh = bd->bd_bh; struct gfs2_glock *gl = bd->bd_gl; - gfs2_remove_from_ail(bd); - bd->bd_bh = NULL; bh->b_private = NULL; bd->bd_blkno = bh->b_blocknr; + gfs2_remove_from_ail(bd); /* drops ref on bh */ + bd->bd_bh = NULL; bd->bd_ops = &gfs2_revoke_lops; sdp->sd_log_num_revoke++; atomic_inc(&gl->gl_revokes); @@ -669,7 +691,8 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) * */ -void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, + enum gfs2_flush_type type) { struct gfs2_trans *tr; @@ -682,36 +705,25 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) } trace_gfs2_log_flush(sdp, 1); + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; tr = sdp->sd_log_tr; if (tr) { sdp->sd_log_tr = NULL; INIT_LIST_HEAD(&tr->tr_ail1_list); INIT_LIST_HEAD(&tr->tr_ail2_list); + tr->tr_first = sdp->sd_log_flush_head; } - if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) { - printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf, - sdp->sd_log_commited_buf); - gfs2_assert_withdraw(sdp, 0); - } - if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) { - printk(KERN_INFO "GFS2: log databuf %u %u\n", - sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf); - gfs2_assert_withdraw(sdp, 0); - } gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); - sdp->sd_log_flush_head = sdp->sd_log_head; - sdp->sd_log_flush_wrapped = 0; - if (tr) - tr->tr_first = sdp->sd_log_flush_head; - gfs2_ordered_write(sdp); - lops_before_commit(sdp); + lops_before_commit(sdp, tr); gfs2_log_flush_bio(sdp, WRITE); if (sdp->sd_log_head != sdp->sd_log_flush_head) { + log_flush_wait(sdp); log_write_header(sdp, 0); } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ @@ -723,8 +735,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) gfs2_log_lock(sdp); sdp->sd_log_head = sdp->sd_log_flush_head; sdp->sd_log_blks_reserved = 0; - sdp->sd_log_commited_buf = 0; - sdp->sd_log_commited_databuf = 0; sdp->sd_log_commited_revoke = 0; spin_lock(&sdp->sd_ail_lock); @@ -734,40 +744,96 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) } spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); + + if (atomic_read(&sdp->sd_log_freeze)) + type = FREEZE_FLUSH; + if (type != NORMAL_FLUSH) { + if (!sdp->sd_log_idle) { + for (;;) { + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + if (gfs2_ail1_empty(sdp)) + break; + } + atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ + trace_gfs2_log_blocks(sdp, -1); + sdp->sd_log_flush_wrapped = 0; + log_write_header(sdp, 0); + sdp->sd_log_head = sdp->sd_log_flush_head; + } + if (type == SHUTDOWN_FLUSH || type == FREEZE_FLUSH) + gfs2_log_shutdown(sdp); + if (type == FREEZE_FLUSH) { + int error; + + atomic_set(&sdp->sd_log_freeze, 0); + wake_up(&sdp->sd_log_frozen_wait); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, + LM_ST_SHARED, 0, + &sdp->sd_thaw_gh); + if (error) { + printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error); + gfs2_assert_withdraw(sdp, 0); + } + else + gfs2_glock_dq_uninit(&sdp->sd_thaw_gh); + } + } + trace_gfs2_log_flush(sdp, 0); up_write(&sdp->sd_log_flush_lock); kfree(tr); } +/** + * gfs2_merge_trans - Merge a new transaction into a cached transaction + * @old: Original transaction to be expanded + * @new: New transaction to be merged + */ + +static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) +{ + WARN_ON_ONCE(old->tr_attached != 1); + + old->tr_num_buf_new += new->tr_num_buf_new; + old->tr_num_databuf_new += new->tr_num_databuf_new; + old->tr_num_buf_rm += new->tr_num_buf_rm; + old->tr_num_databuf_rm += new->tr_num_databuf_rm; + old->tr_num_revoke += new->tr_num_revoke; + old->tr_num_revoke_rm += new->tr_num_revoke_rm; + + list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); + list_splice_tail_init(&new->tr_buf, &old->tr_buf); +} + static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { unsigned int reserved; unsigned int unused; + unsigned int maxres; gfs2_log_lock(sdp); - sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; - sdp->sd_log_commited_databuf += tr->tr_num_databuf_new - - tr->tr_num_databuf_rm; - gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || - (((int)sdp->sd_log_commited_databuf) >= 0)); + if (sdp->sd_log_tr) { + gfs2_merge_trans(sdp->sd_log_tr, tr); + } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) { + gfs2_assert_withdraw(sdp, tr->tr_alloced); + sdp->sd_log_tr = tr; + tr->tr_attached = 1; + } + sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; reserved = calc_reserved(sdp); - gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); - unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; + maxres = sdp->sd_log_blks_reserved + tr->tr_reserved; + gfs2_assert_withdraw(sdp, maxres >= reserved); + unused = maxres - reserved; atomic_add(unused, &sdp->sd_log_blks_free); trace_gfs2_log_blocks(sdp, unused); gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); sdp->sd_log_blks_reserved = reserved; - if (sdp->sd_log_tr == NULL && - (tr->tr_num_buf_new || tr->tr_num_databuf_new)) { - gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl); - sdp->sd_log_tr = tr; - tr->tr_attached = 1; - } gfs2_log_unlock(sdp); } @@ -804,13 +870,8 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) void gfs2_log_shutdown(struct gfs2_sbd *sdp) { - down_write(&sdp->sd_log_flush_lock); - gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); sdp->sd_log_flush_head = sdp->sd_log_head; @@ -818,38 +879,16 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT); - gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); sdp->sd_log_head = sdp->sd_log_flush_head; sdp->sd_log_tail = sdp->sd_log_head; - - up_write(&sdp->sd_log_flush_lock); -} - - -/** - * gfs2_meta_syncfs - sync all the buffers in a filesystem - * @sdp: the filesystem - * - */ - -void gfs2_meta_syncfs(struct gfs2_sbd *sdp) -{ - gfs2_log_flush(sdp, NULL); - for (;;) { - gfs2_ail1_start(sdp); - gfs2_ail1_wait(sdp); - if (gfs2_ail1_empty(sdp)) - break; - } - gfs2_log_flush(sdp, NULL); } static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) { - return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1)); + return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1) || atomic_read(&sdp->sd_log_freeze)); } static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) @@ -876,14 +915,14 @@ int gfs2_logd(void *data) if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } if (gfs2_ail_flush_reqd(sdp)) { gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); gfs2_ail1_empty(sdp); - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } if (!gfs2_ail_flush_reqd(sdp)) diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 37216634f0a..9499a604921 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -63,14 +63,21 @@ extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); +extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); -extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +enum gfs2_flush_type { + NORMAL_FLUSH = 0, + SYNC_FLUSH, + SHUTDOWN_FLUSH, + FREEZE_FLUSH +}; +extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, + enum gfs2_flush_type type); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); -extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_write_revokes(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 010b9fb9fec..2c1ae861dc9 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -75,7 +75,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; struct gfs2_bitmap *bi = rgd->rd_bits + index; - if (bi->bi_clone == 0) + if (bi->bi_clone == NULL) return; if (sdp->sd_args.ar_discard) gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL); @@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); clear_bit(GBF_FULL, &bi->bi_flags); rgd->rd_free_clone = rgd->rd_free; + rgd->rd_extfail_pt = rgd->rd_free; } /** @@ -145,8 +146,8 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) struct gfs2_journal_extent *je; u64 block; - list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { - if (lbn >= je->lblock && lbn < je->lblock + je->blocks) { + list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) { + if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) { block = je->dblock + lbn - je->lblock; gfs2_log_incr_head(sdp); return block; @@ -272,7 +273,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) nrvecs = max(nrvecs/2, 1U); } - bio->bi_sector = blkno * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio->bi_end_io = gfs2_end_log_write; bio->bi_private = sdp; @@ -490,44 +491,40 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, gfs2_log_unlock(sdp); } -static void buf_lo_before_commit(struct gfs2_sbd *sdp) +static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */ - - gfs2_before_commit(sdp, limit, sdp->sd_log_num_buf, - &sdp->sd_log_le_buf, 0); + unsigned int nbuf; + if (tr == NULL) + return; + nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm; + gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0); } static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct list_head *head = &sdp->sd_log_le_buf; + struct list_head *head; struct gfs2_bufdata *bd; - if (tr == NULL) { - gfs2_assert(sdp, list_empty(head)); + if (tr == NULL) return; - } + head = &tr->tr_buf; while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); - sdp->sd_log_num_buf--; - gfs2_unpin(sdp, bd->bd_bh, tr); } - gfs2_assert_warn(sdp, !sdp->sd_log_num_buf); } static void buf_lo_before_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass) { - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - if (pass != 0) return; - sdp->sd_found_blocks = 0; - sdp->sd_replayed_blocks = 0; + jd->jd_found_blocks = 0; + jd->jd_replayed_blocks = 0; } static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, @@ -550,9 +547,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { blkno = be64_to_cpu(*ptr++); - sdp->sd_found_blocks++; + jd->jd_found_blocks++; - if (gfs2_revoke_check(sdp, blkno, start)) + if (gfs2_revoke_check(jd, blkno, start)) continue; error = gfs2_replay_read_block(jd, start, &bh_log); @@ -573,7 +570,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, if (error) break; - sdp->sd_replayed_blocks++; + jd->jd_replayed_blocks++; } return error; @@ -588,8 +585,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, static void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; int error; + if (mapping == NULL) + mapping = &sdp->sd_aspace; + filemap_fdatawrite(mapping); error = filemap_fdatawait(mapping); @@ -612,10 +613,10 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) gfs2_meta_sync(ip->i_gl); fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", - jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); + jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); } -static void revoke_lo_before_commit(struct gfs2_sbd *sdp) +static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct gfs2_meta_header *mh; unsigned int offset; @@ -674,13 +675,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) static void revoke_lo_before_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass) { - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - if (pass != 0) return; - sdp->sd_found_revokes = 0; - sdp->sd_replay_tail = head->lh_tail; + jd->jd_found_revokes = 0; + jd->jd_replay_tail = head->lh_tail; } static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, @@ -712,13 +711,13 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); - error = gfs2_revoke_add(sdp, blkno, start); + error = gfs2_revoke_add(jd, blkno, start); if (error < 0) { brelse(bh); return error; } else if (error) - sdp->sd_found_revokes++; + jd->jd_found_revokes++; if (!--revokes) break; @@ -738,16 +737,16 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); if (error) { - gfs2_revoke_clean(sdp); + gfs2_revoke_clean(jd); return; } if (pass != 1) return; fs_info(sdp, "jid=%u: Found %u revoke tags\n", - jd->jd_jid, sdp->sd_found_revokes); + jd->jd_jid, jd->jd_found_revokes); - gfs2_revoke_clean(sdp); + gfs2_revoke_clean(jd); } /** @@ -755,12 +754,14 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) * */ -static void databuf_lo_before_commit(struct gfs2_sbd *sdp) +static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - unsigned int limit = buf_limit(sdp) / 2; - - gfs2_before_commit(sdp, limit, sdp->sd_log_num_databuf, - &sdp->sd_log_le_databuf, 1); + unsigned int limit = databuf_limit(sdp); + unsigned int nbuf; + if (tr == NULL) + return; + nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1); } static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, @@ -784,9 +785,9 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, blkno = be64_to_cpu(*ptr++); esc = be64_to_cpu(*ptr++); - sdp->sd_found_blocks++; + jd->jd_found_blocks++; - if (gfs2_revoke_check(sdp, blkno, start)) + if (gfs2_revoke_check(jd, blkno, start)) continue; error = gfs2_replay_read_block(jd, start, &bh_log); @@ -806,7 +807,7 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, brelse(bh_log); brelse(bh_ip); - sdp->sd_replayed_blocks++; + jd->jd_replayed_blocks++; } return error; @@ -830,26 +831,23 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) gfs2_meta_sync(ip->i_gl); fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", - jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); + jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); } static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct list_head *head = &sdp->sd_log_le_databuf; + struct list_head *head; struct gfs2_bufdata *bd; - if (tr == NULL) { - gfs2_assert(sdp, list_empty(head)); + if (tr == NULL) return; - } + head = &tr->tr_databuf; while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); - sdp->sd_log_num_databuf--; gfs2_unpin(sdp, bd->bd_bh, tr); } - gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); } diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 9ca2e643841..a65a7ba32ff 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -46,12 +46,13 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) return limit; } -static inline void lops_before_commit(struct gfs2_sbd *sdp) +static inline void lops_before_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) { int x; for (x = 0; gfs2_log_ops[x]; x++) if (gfs2_log_ops[x]->lo_before_commit) - gfs2_log_ops[x]->lo_before_commit(sdp); + gfs2_log_ops[x]->lo_before_commit(sdp, tr); } static inline void lops_after_commit(struct gfs2_sbd *sdp, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 351586e24e3..82b6ac82965 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -31,12 +33,6 @@ struct workqueue_struct *gfs2_control_wq; -static struct shrinker qd_shrinker = { - .count_objects = gfs2_qd_shrink_count, - .scan_objects = gfs2_qd_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; - static void gfs2_init_inode_once(void *foo) { struct gfs2_inode *ip = foo; @@ -82,11 +78,16 @@ static int __init init_gfs2_fs(void) gfs2_str2qstr(&gfs2_qdot, "."); gfs2_str2qstr(&gfs2_qdotdot, ".."); + gfs2_quota_hash_init(); error = gfs2_sys_init(); if (error) return error; + error = list_lru_init(&gfs2_qd_lru); + if (error) + goto fail_lru; + error = gfs2_glock_init(); if (error) goto fail; @@ -139,7 +140,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_rsrv_cachep) goto fail; - register_shrinker(&qd_shrinker); + register_shrinker(&gfs2_qd_shrinker); error = register_filesystem(&gfs2_fs_type); if (error) @@ -166,7 +167,7 @@ static int __init init_gfs2_fs(void) gfs2_register_debugfs(); - printk("GFS2 installed\n"); + pr_info("GFS2 installed\n"); return 0; @@ -179,7 +180,9 @@ fail_wq: fail_unregister: unregister_filesystem(&gfs2_fs_type); fail: - unregister_shrinker(&qd_shrinker); + list_lru_destroy(&gfs2_qd_lru); +fail_lru: + unregister_shrinker(&gfs2_qd_shrinker); gfs2_glock_exit(); if (gfs2_rsrv_cachep) @@ -214,13 +217,14 @@ fail: static void __exit exit_gfs2_fs(void) { - unregister_shrinker(&qd_shrinker); + unregister_shrinker(&gfs2_qd_shrinker); gfs2_glock_exit(); gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); destroy_workqueue(gfs_recovery_wq); destroy_workqueue(gfs2_control_wq); + list_lru_destroy(&gfs2_qd_lru); rcu_barrier(); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 93241505054..b984a6e190b 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -97,6 +97,11 @@ const struct address_space_operations gfs2_meta_aops = { .releasepage = gfs2_releasepage, }; +const struct address_space_operations gfs2_rgrp_aops = { + .writepage = gfs2_aspace_writepage, + .releasepage = gfs2_releasepage, +}; + /** * gfs2_getbuf - Get a buffer with a given address space * @gl: the glock @@ -116,6 +121,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) unsigned long index; unsigned int bufnum; + if (mapping == NULL) + mapping = &sdp->sd_aspace; + shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; index = blkno >> shift; /* convert block to page */ bufnum = blkno - (index << shift); /* block buf index within page */ @@ -128,7 +136,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) yield(); } } else { - page = find_lock_page(mapping, index); + page = find_get_page_flags(mapping, index, + FGP_LOCK|FGP_ACCESSED); if (!page) return NULL; } @@ -145,7 +154,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) map_bh(bh, sdp->sd_vfs, blkno); unlock_page(page); - mark_page_accessed(page); page_cache_release(page); return bh; @@ -258,27 +266,27 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int struct address_space *mapping = bh->b_page->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; + int was_pinned = 0; if (test_clear_buffer_pinned(bh)) { trace_gfs2_pin(bd, 0); atomic_dec(&sdp->sd_log_pinned); list_del_init(&bd->bd_list); - if (meta) { - gfs2_assert_warn(sdp, sdp->sd_log_num_buf); - sdp->sd_log_num_buf--; + if (meta) tr->tr_num_buf_rm++; - } else { - gfs2_assert_warn(sdp, sdp->sd_log_num_databuf); - sdp->sd_log_num_databuf--; + else tr->tr_num_databuf_rm++; - } tr->tr_touched = 1; + was_pinned = 1; brelse(bh); } if (bd) { spin_lock(&sdp->sd_ail_lock); if (bd->bd_tr) { gfs2_trans_add_revoke(sdp, bd); + } else if (was_pinned) { + bh->b_private = NULL; + kmem_cache_free(gfs2_bufdata_cachep, bd); } spin_unlock(&sdp->sd_ail_lock); } diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 4823b934208..ac5d8027d33 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -38,12 +38,15 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh, } extern const struct address_space_operations gfs2_meta_aops; +extern const struct address_space_operations gfs2_rgrp_aops; static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) { struct inode *inode = mapping->host; if (mapping->a_ops == &gfs2_meta_aops) return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; + else if (mapping->a_ops == &gfs2_rgrp_aops) + return container_of(mapping, struct gfs2_sbd, sd_aspace); else return inode->i_sb->s_fs_info; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 19ff5e8c285..bc564c0d6d1 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -36,6 +38,7 @@ #include "log.h" #include "quota.h" #include "dir.h" +#include "meta_io.h" #include "trace_gfs2.h" #define DO 0 @@ -51,7 +54,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) { spin_lock_init(>->gt_spin); - gt->gt_quota_simul_sync = 64; gt->gt_quota_warn_period = 10; gt->gt_quota_scale_num = 1; gt->gt_quota_scale_den = 1; @@ -63,6 +65,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt) static struct gfs2_sbd *init_sbd(struct super_block *sb) { struct gfs2_sbd *sdp; + struct address_space *mapping; sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); if (!sdp) @@ -91,18 +94,30 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_jindex_list); spin_lock_init(&sdp->sd_jindex_spin); mutex_init(&sdp->sd_jindex_mutex); + init_completion(&sdp->sd_journal_ready); INIT_LIST_HEAD(&sdp->sd_quota_list); mutex_init(&sdp->sd_quota_mutex); + mutex_init(&sdp->sd_quota_sync_mutex); init_waitqueue_head(&sdp->sd_quota_wait); INIT_LIST_HEAD(&sdp->sd_trunc_list); spin_lock_init(&sdp->sd_trunc_lock); + spin_lock_init(&sdp->sd_bitmap_lock); + + mapping = &sdp->sd_aspace; + + address_space_init_once(mapping); + mapping->a_ops = &gfs2_rgrp_aops; + mapping->host = sb->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->backing_dev_info = sb->s_bdi; + mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); atomic_set(&sdp->sd_log_pinned, 0); - INIT_LIST_HEAD(&sdp->sd_log_le_buf); INIT_LIST_HEAD(&sdp->sd_log_le_revoke); - INIT_LIST_HEAD(&sdp->sd_log_le_databuf); INIT_LIST_HEAD(&sdp->sd_log_le_ordered); spin_lock_init(&sdp->sd_ordered_lock); @@ -115,8 +130,10 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_rwsem(&sdp->sd_log_flush_lock); atomic_set(&sdp->sd_log_in_flight, 0); init_waitqueue_head(&sdp->sd_log_flush_wait); - - INIT_LIST_HEAD(&sdp->sd_revoke_list); + init_waitqueue_head(&sdp->sd_log_frozen_wait); + atomic_set(&sdp->sd_log_freeze, 0); + atomic_set(&sdp->sd_frozen_root, 0); + init_waitqueue_head(&sdp->sd_frozen_root_wait); return sdp; } @@ -140,7 +157,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) if (sb->sb_magic != GFS2_MAGIC || sb->sb_type != GFS2_METATYPE_SB) { if (!silent) - printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); + pr_warn("not a GFS2 filesystem\n"); return -EINVAL; } @@ -162,7 +179,7 @@ static void end_bio_io_page(struct bio *bio, int error) if (!error) SetPageUptodate(page); else - printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); + pr_warn("error %d reading superblock\n", error); unlock_page(page); } @@ -217,14 +234,14 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) page = alloc_page(GFP_NOFS); if (unlikely(!page)) - return -ENOBUFS; + return -ENOMEM; ClearPageUptodate(page); ClearPageDirty(page); lock_page(page); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = sector * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -407,8 +424,8 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, goto fail_live; } - error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops, - CREATE, &sdp->sd_trans_gl); + error = gfs2_glock_get(sdp, GFS2_FREEZE_LOCK, &gfs2_freeze_glops, + CREATE, &sdp->sd_freeze_gl); if (error) { fs_err(sdp, "can't create transaction glock: %d\n", error); goto fail_rename; @@ -417,7 +434,7 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, return 0; fail_trans: - gfs2_glock_put(sdp->sd_trans_gl); + gfs2_glock_put(sdp->sd_freeze_gl); fail_rename: gfs2_glock_put(sdp->sd_rename_gl); fail_live: @@ -505,67 +522,6 @@ out: return ret; } -/** - * map_journal_extents - create a reusable "extent" mapping from all logical - * blocks to all physical blocks for the given journal. This will save - * us time when writing journal blocks. Most journals will have only one - * extent that maps all their logical blocks. That's because gfs2.mkfs - * arranges the journal blocks sequentially to maximize performance. - * So the extent would map the first block for the entire file length. - * However, gfs2_jadd can happen while file activity is happening, so - * those journals may not be sequential. Less likely is the case where - * the users created their own journals by mounting the metafs and - * laying it out. But it's still possible. These journals might have - * several extents. - * - * TODO: This should be done in bigger chunks rather than one block at a time, - * but since it's only done at mount time, I'm not worried about the - * time it takes. - */ -static int map_journal_extents(struct gfs2_sbd *sdp) -{ - struct gfs2_jdesc *jd = sdp->sd_jdesc; - unsigned int lb; - u64 db, prev_db; /* logical block, disk block, prev disk block */ - struct gfs2_inode *ip = GFS2_I(jd->jd_inode); - struct gfs2_journal_extent *jext = NULL; - struct buffer_head bh; - int rc = 0; - - prev_db = 0; - - for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) { - bh.b_state = 0; - bh.b_blocknr = 0; - bh.b_size = 1 << ip->i_inode.i_blkbits; - rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0); - db = bh.b_blocknr; - if (rc || !db) { - printk(KERN_INFO "GFS2 journal mapping error %d: lb=" - "%u db=%llu\n", rc, lb, (unsigned long long)db); - break; - } - if (!prev_db || db != prev_db + 1) { - jext = kzalloc(sizeof(struct gfs2_journal_extent), - GFP_KERNEL); - if (!jext) { - printk(KERN_INFO "GFS2 error: out of memory " - "mapping journal extents.\n"); - rc = -ENOMEM; - break; - } - jext->dblock = db; - jext->lblock = lb; - jext->blocks = 1; - list_add_tail(&jext->extent_list, &jd->extent_list); - } else { - jext->blocks++; - } - prev_db = db; - } - return rc; -} - static void gfs2_others_may_mount(struct gfs2_sbd *sdp) { char *message = "FIRSTMOUNT=Done"; @@ -624,6 +580,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; INIT_LIST_HEAD(&jd->extent_list); + INIT_LIST_HEAD(&jd->jd_revoke_list); + INIT_WORK(&jd->jd_work, gfs2_recover_func); jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { @@ -767,7 +725,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); /* Map the extents for this journal's blocks */ - map_journal_extents(sdp); + gfs2_map_journal_extents(sdp, sdp->sd_jdesc); } trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); @@ -802,7 +760,15 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); gfs2_glock_dq_uninit(&ji_gh); jindex = 0; - + if (!sdp->sd_args.ar_spectator) { + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, + &sdp->sd_thaw_gh); + if (error) { + fs_err(sdp, "can't acquire freeze glock: %d\n", error); + goto fail_jinode_gh; + } + } + gfs2_glock_dq_uninit(&sdp->sd_thaw_gh); return 0; fail_jinode_gh: @@ -831,6 +797,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) goto fail_qinode; error = init_journal(sdp, undo); + complete_all(&sdp->sd_journal_ready); if (error) goto fail; @@ -956,40 +923,6 @@ fail: return error; } -static int init_threads(struct gfs2_sbd *sdp, int undo) -{ - struct task_struct *p; - int error = 0; - - if (undo) - goto fail_quotad; - - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start logd thread: %d\n", error); - return error; - } - sdp->sd_logd_process = p; - - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start quotad thread: %d\n", error); - goto fail; - } - sdp->sd_quotad_process = p; - - return 0; - - -fail_quotad: - kthread_stop(sdp->sd_quotad_process); -fail: - kthread_stop(sdp->sd_logd_process); - return error; -} - static const match_table_t nolock_tokens = { { Opt_jid, "jid=%d\n", }, { Opt_err, NULL }, @@ -1028,7 +961,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) lm = &gfs2_dlm_ops; #endif } else { - printk(KERN_INFO "GFS2: can't find protocol %s\n", proto); + pr_info("can't find protocol %s\n", proto); return -ENOENT; } @@ -1135,7 +1068,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sdp = init_sbd(sb); if (!sdp) { - printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); + pr_warn("can't alloc struct gfs2_sbd\n"); return -ENOMEM; } sdp->sd_args = *args; @@ -1254,15 +1187,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent goto fail_per_node; } - error = init_threads(sdp, DO); - if (error) - goto fail_per_node; - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_rw(sdp); if (error) { fs_err(sdp, "can't make FS RW: %d\n", error); - goto fail_threads; + goto fail_per_node; } } @@ -1270,8 +1199,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent gfs2_online_uevent(sdp); return 0; -fail_threads: - init_threads(sdp, UNDO); fail_per_node: init_per_node(sdp, UNDO); fail_inodes: @@ -1287,6 +1214,7 @@ fail_sb: fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: + complete_all(&sdp->sd_journal_ready); gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); fail_debug: @@ -1366,8 +1294,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (IS_ERR(s)) goto error_bdev; - if (s->s_root) + if (s->s_root) { + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. + */ + up_write(&s->s_umount); blkdev_put(bdev, mode); + down_write(&s->s_umount); + } memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; @@ -1379,7 +1317,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, error = gfs2_mount_args(&args, data); if (error) { - printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); + pr_warn("can't parse mount arguments\n"); goto error_super; } @@ -1429,15 +1367,15 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, error = kern_path(dev_name, LOOKUP_FOLLOW, &path); if (error) { - printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", - dev_name, error); + pr_warn("path_lookup on %s returned error %d\n", + dev_name, error); return ERR_PTR(error); } s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, path.dentry->d_inode->i_sb->s_bdev); path_put(&path); if (IS_ERR(s)) { - printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); + pr_warn("gfs2 mount does not exist\n"); return ERR_CAST(s); } if ((flags ^ s->s_flags) & MS_RDONLY) { @@ -1457,7 +1395,7 @@ static void gfs2_kill_sb(struct super_block *sb) return; } - gfs2_meta_syncfs(sdp); + gfs2_log_flush(sdp, NULL, SYNC_FLUSH); dput(sdp->sd_root_dir); dput(sdp->sd_master_dir); sdp->sd_root_dir = NULL; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index db441359ee8..64b29f7f6b4 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -36,6 +36,8 @@ * the quota file, so it is not being constantly read. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> #include <linux/mm.h> @@ -50,6 +52,13 @@ #include <linux/freezer.h> #include <linux/quota.h> #include <linux/dqblk_xfs.h> +#include <linux/lockref.h> +#include <linux/list_lru.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/bit_spinlock.h> +#include <linux/jhash.h> +#include <linux/vmalloc.h> #include "gfs2.h" #include "incore.h" @@ -65,35 +74,63 @@ #include "inode.h" #include "util.h" -struct gfs2_quota_change_host { - u64 qc_change; - u32 qc_flags; /* GFS2_QCF_... */ - struct kqid qc_id; -}; +#define GFS2_QD_HASH_SHIFT 12 +#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT) +#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) -static LIST_HEAD(qd_lru_list); -static atomic_t qd_lru_count = ATOMIC_INIT(0); -static DEFINE_SPINLOCK(qd_lru_lock); +/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ +/* -> sd_bitmap_lock */ +static DEFINE_SPINLOCK(qd_lock); +struct list_lru gfs2_qd_lru; -unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) +static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE]; + +static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp, + const struct kqid qid) +{ + unsigned int h; + + h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0); + h = jhash(&qid, sizeof(struct kqid), h); + + return h & GFS2_QD_HASH_MASK; +} + +static inline void spin_lock_bucket(unsigned int hash) +{ + hlist_bl_lock(&qd_hash_table[hash]); +} + +static inline void spin_unlock_bucket(unsigned int hash) +{ + hlist_bl_unlock(&qd_hash_table[hash]); +} + +static void gfs2_qd_dealloc(struct rcu_head *rcu) +{ + struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu); + kmem_cache_free(gfs2_quotad_cachep, qd); +} + +static void gfs2_qd_dispose(struct list_head *list) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; - int nr_to_scan = sc->nr_to_scan; - long freed = 0; - - if (!(sc->gfp_mask & __GFP_FS)) - return SHRINK_STOP; - spin_lock(&qd_lru_lock); - while (nr_to_scan && !list_empty(&qd_lru_list)) { - qd = list_entry(qd_lru_list.next, - struct gfs2_quota_data, qd_reclaim); + while (!list_empty(list)) { + qd = list_entry(list->next, struct gfs2_quota_data, qd_lru); sdp = qd->qd_gl->gl_sbd; + list_del(&qd->qd_lru); + /* Free from the filesystem-specific list */ + spin_lock(&qd_lock); list_del(&qd->qd_list); + spin_unlock(&qd_lock); + + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_count); @@ -103,24 +140,59 @@ unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, atomic_dec(&sdp->sd_quota_count); /* Delete it from the common reclaim list */ - list_del_init(&qd->qd_reclaim); - atomic_dec(&qd_lru_count); - spin_unlock(&qd_lru_lock); - kmem_cache_free(gfs2_quotad_cachep, qd); - spin_lock(&qd_lru_lock); - nr_to_scan--; - freed++; + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); } - spin_unlock(&qd_lru_lock); +} + + +static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) +{ + struct list_head *dispose = arg; + struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); + + if (!spin_trylock(&qd->qd_lockref.lock)) + return LRU_SKIP; + + if (qd->qd_lockref.count == 0) { + lockref_mark_dead(&qd->qd_lockref); + list_move(&qd->qd_lru, dispose); + } + + spin_unlock(&qd->qd_lockref.lock); + return LRU_REMOVED; +} + +static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + + freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, + &dispose, &sc->nr_to_scan); + + gfs2_qd_dispose(&dispose); + return freed; } -unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) { - return vfs_pressure_ratio(atomic_read(&qd_lru_count)); + return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); } +struct shrinker gfs2_qd_shrinker = { + .count_objects = gfs2_qd_shrink_count, + .scan_objects = gfs2_qd_shrink_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + + static u64 qd2index(struct gfs2_quota_data *qd) { struct kqid qid = qd->qd_id; @@ -138,168 +210,158 @@ static u64 qd2offset(struct gfs2_quota_data *qd) return offset; } -static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; int error; qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); if (!qd) - return -ENOMEM; + return NULL; - atomic_set(&qd->qd_count, 1); + qd->qd_sbd = sdp; + qd->qd_lockref.count = 1; + spin_lock_init(&qd->qd_lockref.lock); qd->qd_id = qid; qd->qd_slot = -1; - INIT_LIST_HEAD(&qd->qd_reclaim); + INIT_LIST_HEAD(&qd->qd_lru); + qd->qd_hash = hash; error = gfs2_glock_get(sdp, qd2index(qd), &gfs2_quota_glops, CREATE, &qd->qd_gl); if (error) goto fail; - *qdp = qd; - - return 0; + return qd; fail: kmem_cache_free(gfs2_quotad_cachep, qd); - return error; + return NULL; } -static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + struct kqid qid) { - struct gfs2_quota_data *qd = NULL, *new_qd = NULL; - int error, found; - - *qdp = NULL; + struct gfs2_quota_data *qd; + struct hlist_bl_node *h; - for (;;) { - found = 0; - spin_lock(&qd_lru_lock); - list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (qid_eq(qd->qd_id, qid)) { - if (!atomic_read(&qd->qd_count) && - !list_empty(&qd->qd_reclaim)) { - /* Remove it from reclaim list */ - list_del_init(&qd->qd_reclaim); - atomic_dec(&qd_lru_count); - } - atomic_inc(&qd->qd_count); - found = 1; - break; - } + hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) { + if (!qid_eq(qd->qd_id, qid)) + continue; + if (qd->qd_sbd != sdp) + continue; + if (lockref_get_not_dead(&qd->qd_lockref)) { + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + return qd; } + } - if (!found) - qd = NULL; + return NULL; +} - if (!qd && new_qd) { - qd = new_qd; - list_add(&qd->qd_list, &sdp->sd_quota_list); - atomic_inc(&sdp->sd_quota_count); - new_qd = NULL; - } - spin_unlock(&qd_lru_lock); +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd, *new_qd; + unsigned int hash = gfs2_qd_hash(sdp, qid); + + rcu_read_lock(); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + rcu_read_unlock(); - if (qd) { - if (new_qd) { - gfs2_glock_put(new_qd->qd_gl); - kmem_cache_free(gfs2_quotad_cachep, new_qd); - } - *qdp = qd; - return 0; - } + if (qd) + return 0; - error = qd_alloc(sdp, qid, &new_qd); - if (error) - return error; + new_qd = qd_alloc(hash, sdp, qid); + if (!new_qd) + return -ENOMEM; + + spin_lock(&qd_lock); + spin_lock_bucket(hash); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + if (qd == NULL) { + *qdp = new_qd; + list_add(&new_qd->qd_list, &sdp->sd_quota_list); + hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); + atomic_inc(&sdp->sd_quota_count); + } + spin_unlock_bucket(hash); + spin_unlock(&qd_lock); + + if (qd) { + gfs2_glock_put(new_qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, new_qd); } + + return 0; } + static void qd_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - gfs2_assert(sdp, atomic_read(&qd->qd_count)); - atomic_inc(&qd->qd_count); + gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref)); + lockref_get(&qd->qd_lockref); } static void qd_put(struct gfs2_quota_data *qd) { - if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) { - /* Add to the reclaim list */ - list_add_tail(&qd->qd_reclaim, &qd_lru_list); - atomic_inc(&qd_lru_count); - spin_unlock(&qd_lru_lock); - } + if (lockref_put_or_lock(&qd->qd_lockref)) + return; + + qd->qd_lockref.count = 0; + list_lru_add(&gfs2_qd_lru, &qd->qd_lru); + spin_unlock(&qd->qd_lockref.lock); + } static int slot_get(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - unsigned int c, o = 0, b; - unsigned char byte = 0; + struct gfs2_sbd *sdp = qd->qd_sbd; + unsigned int bit; + int error = 0; - spin_lock(&qd_lru_lock); + spin_lock(&sdp->sd_bitmap_lock); + if (qd->qd_slot_count != 0) + goto out; - if (qd->qd_slot_count++) { - spin_unlock(&qd_lru_lock); - return 0; + error = -ENOSPC; + bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots); + if (bit < sdp->sd_quota_slots) { + set_bit(bit, sdp->sd_quota_bitmap); + qd->qd_slot = bit; + error = 0; +out: + qd->qd_slot_count++; } + spin_unlock(&sdp->sd_bitmap_lock); - for (c = 0; c < sdp->sd_quota_chunks; c++) - for (o = 0; o < PAGE_SIZE; o++) { - byte = sdp->sd_quota_bitmap[c][o]; - if (byte != 0xFF) - goto found; - } - - goto fail; - -found: - for (b = 0; b < 8; b++) - if (!(byte & (1 << b))) - break; - qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b; - - if (qd->qd_slot >= sdp->sd_quota_slots) - goto fail; - - sdp->sd_quota_bitmap[c][o] |= 1 << b; - - spin_unlock(&qd_lru_lock); - - return 0; - -fail: - qd->qd_slot_count--; - spin_unlock(&qd_lru_lock); - return -ENOSPC; + return error; } static void slot_hold(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&qd_lru_lock); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); qd->qd_slot_count++; - spin_unlock(&qd_lru_lock); + spin_unlock(&sdp->sd_bitmap_lock); } static void slot_put(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&qd_lru_lock); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); if (!--qd->qd_slot_count) { - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); + BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); qd->qd_slot = -1; } - spin_unlock(&qd_lru_lock); + spin_unlock(&sdp->sd_bitmap_lock); } static int bh_get(struct gfs2_quota_data *qd) @@ -363,6 +425,24 @@ static void bh_put(struct gfs2_quota_data *qd) mutex_unlock(&sdp->sd_quota_mutex); } +static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, + u64 *sync_gen) +{ + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags) || + (sync_gen && (qd->qd_sync_gen >= *sync_gen))) + return 0; + + if (!lockref_get_not_dead(&qd->qd_lockref)) + return 0; + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + set_bit(QDF_LOCKED, &qd->qd_flags); + qd->qd_change_sync = qd->qd_change; + slot_hold(qd); + return 1; +} + static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd = NULL; @@ -374,31 +454,18 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) if (sdp->sd_vfs->s_flags & MS_RDONLY) return 0; - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (test_bit(QDF_LOCKED, &qd->qd_flags) || - !test_bit(QDF_CHANGE, &qd->qd_flags) || - qd->qd_sync_gen >= sdp->sd_quota_sync_gen) - continue; - - list_move_tail(&qd->qd_list, &sdp->sd_quota_list); - - set_bit(QDF_LOCKED, &qd->qd_flags); - gfs2_assert_warn(sdp, atomic_read(&qd->qd_count)); - atomic_inc(&qd->qd_count); - qd->qd_change_sync = qd->qd_change; - gfs2_assert_warn(sdp, qd->qd_slot_count); - qd->qd_slot_count++; - found = 1; - - break; + found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen); + if (found) + break; } if (!found) qd = NULL; - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); if (qd) { gfs2_assert_warn(sdp, qd->qd_change_sync); @@ -416,43 +483,6 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) return 0; } -static int qd_trylock(struct gfs2_quota_data *qd) -{ - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - - if (sdp->sd_vfs->s_flags & MS_RDONLY) - return 0; - - spin_lock(&qd_lru_lock); - - if (test_bit(QDF_LOCKED, &qd->qd_flags) || - !test_bit(QDF_CHANGE, &qd->qd_flags)) { - spin_unlock(&qd_lru_lock); - return 0; - } - - list_move_tail(&qd->qd_list, &sdp->sd_quota_list); - - set_bit(QDF_LOCKED, &qd->qd_flags); - gfs2_assert_warn(sdp, atomic_read(&qd->qd_count)); - atomic_inc(&qd->qd_count); - qd->qd_change_sync = qd->qd_change; - gfs2_assert_warn(sdp, qd->qd_slot_count); - qd->qd_slot_count++; - - spin_unlock(&qd_lru_lock); - - gfs2_assert_warn(sdp, qd->qd_change_sync); - if (bh_get(qd)) { - clear_bit(QDF_LOCKED, &qd->qd_flags); - slot_put(qd); - qd_put(qd); - return 0; - } - - return 1; -} - static void qd_unlock(struct gfs2_quota_data *qd) { gfs2_assert_warn(qd->qd_gl->gl_sbd, @@ -602,9 +632,9 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) x = be64_to_cpu(qc->qc_change) + change; qc->qc_change = cpu_to_be64(x); - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); qd->qd_change = x; - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); if (!x) { gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); @@ -648,7 +678,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, struct buffer_head *bh; struct page *page; void *kaddr, *ptr; - struct gfs2_quota q, *qp; + struct gfs2_quota q; int err, nbytes; u64 size; @@ -664,28 +694,25 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, return err; err = -EIO; - qp = &q; - qp->qu_value = be64_to_cpu(qp->qu_value); - qp->qu_value += change; - qp->qu_value = cpu_to_be64(qp->qu_value); - qd->qd_qb.qb_value = qp->qu_value; + be64_add_cpu(&q.qu_value, change); + qd->qd_qb.qb_value = q.qu_value; if (fdq) { if (fdq->d_fieldmask & FS_DQ_BSOFT) { - qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_warn = qp->qu_warn; + q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_warn = q.qu_warn; } if (fdq->d_fieldmask & FS_DQ_BHARD) { - qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_limit = qp->qu_limit; + q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_limit = q.qu_limit; } if (fdq->d_fieldmask & FS_DQ_BCOUNT) { - qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_value = qp->qu_value; + q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_value = q.qu_value; } } /* Write the quota into the quota file on disk */ - ptr = qp; + ptr = &q; nbytes = sizeof(struct gfs2_quota); get_a_page: page = find_or_create_page(mapping, index, GFP_NOFS); @@ -751,6 +778,7 @@ get_a_page: i_size_write(inode, size); inode->i_mtime = inode->i_atime = CURRENT_TIME; mark_inode_dirty(inode); + set_bit(QDF_REFRESH, &qd->qd_flags); return 0; unlock_out: @@ -763,6 +791,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) { struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks, ind_blocks; struct gfs2_holder *ghs, i_gh; unsigned int qx, x; @@ -815,7 +844,8 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; reserved = 1 + (nalloc * (data_blocks + ind_blocks)); - error = gfs2_inplace_reserve(ip, reserved, 0); + ap.target = reserved; + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_alloc; @@ -850,7 +880,7 @@ out: gfs2_glock_dq_uninit(&ghs[qx]); mutex_unlock(&ip->i_inode.i_mutex); kfree(ghs); - gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); + gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH); return error; } @@ -974,9 +1004,9 @@ static int need_sync(struct gfs2_quota_data *qd) if (!qd->qd_qb.qb_limit) return 0; - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); value = qd->qd_change; - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); spin_lock(>->gt_spin); num = gt->gt_quota_scale_num; @@ -1001,9 +1031,11 @@ static int need_sync(struct gfs2_quota_data *qd) void gfs2_quota_unlock(struct gfs2_inode *ip) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qda[4]; unsigned int count = 0; unsigned int x; + int found; if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) goto out; @@ -1016,9 +1048,25 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) sync = need_sync(qd); gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); + if (!sync) + continue; + + spin_lock(&qd_lock); + found = qd_check_sync(sdp, qd, NULL); + spin_unlock(&qd_lock); + + if (!found) + continue; - if (sync && qd_trylock(qd)) - qda[count++] = qd; + gfs2_assert_warn(sdp, qd->qd_change_sync); + if (bh_get(qd)) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + continue; + } + + qda[count++] = qd; } if (count) { @@ -1037,10 +1085,10 @@ static int print_message(struct gfs2_quota_data *qd, char *type) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n", - sdp->sd_fsname, type, - (qd->qd_id.type == USRQUOTA) ? "user" : "group", - from_kqid(&init_user_ns, qd->qd_id)); + fs_info(sdp, "quota %s for %s %u\n", + type, + (qd->qd_id.type == USRQUOTA) ? "user" : "group", + from_kqid(&init_user_ns, qd->qd_id)); return 0; } @@ -1067,9 +1115,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) continue; value = (s64)be64_to_cpu(qd->qd_qb.qb_value); - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); value += qd->qd_change; - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { print_message(qd, "exceeded"); @@ -1118,17 +1166,18 @@ int gfs2_quota_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; - unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync); + unsigned int max_qd = PAGE_SIZE/sizeof(struct gfs2_holder); unsigned int num_qd; unsigned int x; int error = 0; - sdp->sd_quota_sync_gen++; - qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); if (!qda) return -ENOMEM; + mutex_lock(&sdp->sd_quota_sync_mutex); + sdp->sd_quota_sync_gen++; + do { num_qd = 0; @@ -1153,6 +1202,7 @@ int gfs2_quota_sync(struct super_block *sb, int type) } } while (!error && num_qd == max_qd); + mutex_unlock(&sdp->sd_quota_sync_mutex); kfree(qda); return error; @@ -1176,17 +1226,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) return error; } -static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf) -{ - const struct gfs2_quota_change *str = buf; - - qc->qc_change = be64_to_cpu(str->qc_change); - qc->qc_flags = be32_to_cpu(str->qc_flags); - qc->qc_id = make_kqid(&init_user_ns, - (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA, - be32_to_cpu(str->qc_id)); -} - int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); @@ -1194,6 +1233,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; + unsigned int hash; + unsigned int bm_size; u64 dblock; u32 extlen = 0; int error; @@ -1202,23 +1243,19 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) return -EIO; sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; - sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); - + bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); + bm_size *= sizeof(unsigned long); error = -ENOMEM; - - sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, - sizeof(unsigned char *), GFP_NOFS); + sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); + if (sdp->sd_quota_bitmap == NULL) + sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | + __GFP_ZERO, PAGE_KERNEL); if (!sdp->sd_quota_bitmap) return error; - for (x = 0; x < sdp->sd_quota_chunks; x++) { - sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS); - if (!sdp->sd_quota_bitmap[x]) - goto fail; - } - for (x = 0; x < blocks; x++) { struct buffer_head *bh; + const struct gfs2_quota_change *qc; unsigned int y; if (!extlen) { @@ -1236,33 +1273,41 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) goto fail; } + qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header)); for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; y++, slot++) { - struct gfs2_quota_change_host qc; struct gfs2_quota_data *qd; - - gfs2_quota_change_in(&qc, bh->b_data + - sizeof(struct gfs2_meta_header) + - y * sizeof(struct gfs2_quota_change)); - if (!qc.qc_change) + s64 qc_change = be64_to_cpu(qc->qc_change); + u32 qc_flags = be32_to_cpu(qc->qc_flags); + enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ? + USRQUOTA : GRPQUOTA; + struct kqid qc_id = make_kqid(&init_user_ns, qtype, + be32_to_cpu(qc->qc_id)); + qc++; + if (!qc_change) continue; - error = qd_alloc(sdp, qc.qc_id, &qd); - if (error) { + hash = gfs2_qd_hash(sdp, qc_id); + qd = qd_alloc(hash, sdp, qc_id); + if (qd == NULL) { brelse(bh); goto fail; } set_bit(QDF_CHANGE, &qd->qd_flags); - qd->qd_change = qc.qc_change; + qd->qd_change = qc_change; qd->qd_slot = slot; qd->qd_slot_count = 1; - spin_lock(&qd_lru_lock); - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); + spin_lock(&qd_lock); + BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap)); list_add(&qd->qd_list, &sdp->sd_quota_list); atomic_inc(&sdp->sd_quota_count); - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); + + spin_lock_bucket(hash); + hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]); + spin_unlock_bucket(hash); found++; } @@ -1286,51 +1331,41 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) { struct list_head *head = &sdp->sd_quota_list; struct gfs2_quota_data *qd; - unsigned int x; - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); while (!list_empty(head)) { qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); - if (atomic_read(&qd->qd_count) > 1 || - (atomic_read(&qd->qd_count) && - !test_bit(QDF_CHANGE, &qd->qd_flags))) { - list_move(&qd->qd_list, head); - spin_unlock(&qd_lru_lock); - schedule(); - spin_lock(&qd_lru_lock); - continue; - } - list_del(&qd->qd_list); + /* Also remove if this qd exists in the reclaim list */ - if (!list_empty(&qd->qd_reclaim)) { - list_del_init(&qd->qd_reclaim); - atomic_dec(&qd_lru_count); - } + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); atomic_dec(&sdp->sd_quota_count); - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); + + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); - if (!atomic_read(&qd->qd_count)) { - gfs2_assert_warn(sdp, !qd->qd_change); - gfs2_assert_warn(sdp, !qd->qd_slot_count); - } else - gfs2_assert_warn(sdp, qd->qd_slot_count == 1); + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); gfs2_glock_put(qd->qd_gl); - kmem_cache_free(gfs2_quotad_cachep, qd); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); } - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); if (sdp->sd_quota_bitmap) { - for (x = 0; x < sdp->sd_quota_chunks; x++) - kfree(sdp->sd_quota_bitmap[x]); - kfree(sdp->sd_quota_bitmap); + if (is_vmalloc_addr(sdp->sd_quota_bitmap)) + vfree(sdp->sd_quota_bitmap); + else + kfree(sdp->sd_quota_bitmap); + sdp->sd_quota_bitmap = NULL; } } @@ -1462,7 +1497,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb, } fqs->qs_uquota.qfs_nextents = 1; /* unsupported */ fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */ - fqs->qs_incoredqs = atomic_read(&qd_lru_count); + fqs->qs_incoredqs = list_lru_count(&gfs2_qd_lru); return 0; } @@ -1573,10 +1608,12 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (gfs2_is_stuffed(ip)) alloc_required = 1; if (alloc_required) { + struct gfs2_alloc_parms ap = { .aflags = 0, }; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); blocks = 1 + data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip, blocks, 0); + ap.target = blocks; + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_i; blocks += gfs2_rg_blocks(ip, blocks); @@ -1612,3 +1649,11 @@ const struct quotactl_ops gfs2_quotactl_ops = { .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; + +void __init gfs2_quota_hash_init(void) +{ + unsigned i; + + for(i = 0; i < GFS2_QD_HASH_SIZE; i++) + INIT_HLIST_BL_HEAD(&qd_hash_table[i]); +} diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 0f64d9deb1b..55d506eb3c4 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -10,9 +10,10 @@ #ifndef __QUOTA_DOT_H__ #define __QUOTA_DOT_H__ +#include <linux/list_lru.h> + struct gfs2_inode; struct gfs2_sbd; -struct shrink_control; #define NO_UID_QUOTA_CHANGE INVALID_UID #define NO_GID_QUOTA_CHANGE INVALID_GID @@ -53,10 +54,9 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, - struct shrink_control *sc); -extern unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc); extern const struct quotactl_ops gfs2_quotactl_ops; +extern struct shrinker gfs2_qd_shrinker; +extern struct list_lru gfs2_qd_lru; +extern void __init gfs2_quota_hash_init(void); #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 963b2d75200..94555d4c569 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -52,9 +52,9 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, return error; } -int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { - struct list_head *head = &sdp->sd_revoke_list; + struct list_head *head = &jd->jd_revoke_list; struct gfs2_revoke_replay *rr; int found = 0; @@ -81,13 +81,13 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) return 1; } -int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { struct gfs2_revoke_replay *rr; int wrap, a, b, revoke; int found = 0; - list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) { + list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) { if (rr->rr_blkno == blkno) { found = 1; break; @@ -97,17 +97,17 @@ int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) if (!found) return 0; - wrap = (rr->rr_where < sdp->sd_replay_tail); - a = (sdp->sd_replay_tail < where); + wrap = (rr->rr_where < jd->jd_replay_tail); + a = (jd->jd_replay_tail < where); b = (where < rr->rr_where); revoke = (wrap) ? (a || b) : (a && b); return revoke; } -void gfs2_revoke_clean(struct gfs2_sbd *sdp) +void gfs2_revoke_clean(struct gfs2_jdesc *jd) { - struct list_head *head = &sdp->sd_revoke_list; + struct list_head *head = &jd->jd_revoke_list; struct gfs2_revoke_replay *rr; while (!list_empty(head)) { @@ -454,7 +454,7 @@ void gfs2_recover_func(struct work_struct *work) struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; - struct gfs2_holder j_gh, ji_gh, t_gh; + struct gfs2_holder j_gh, ji_gh, thaw_gh; unsigned long t; int ro = 0; unsigned int pass; @@ -508,11 +508,11 @@ void gfs2_recover_func(struct work_struct *work) t = jiffies; - /* Acquire a shared hold on the transaction lock */ + /* Acquire a shared hold on the freeze lock */ - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | LM_FLAG_PRIORITY | - GL_NOCACHE, &t_gh); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | LM_FLAG_PRIORITY, + &thaw_gh); if (error) goto fail_gunlock_ji; @@ -538,7 +538,7 @@ void gfs2_recover_func(struct work_struct *work) fs_warn(sdp, "jid=%u: Can't replay: read-only block " "device\n", jd->jd_jid); error = -EROFS; - goto fail_gunlock_tr; + goto fail_gunlock_thaw; } fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); @@ -549,14 +549,14 @@ void gfs2_recover_func(struct work_struct *work) head.lh_blkno, pass); lops_after_scan(jd, error, pass); if (error) - goto fail_gunlock_tr; + goto fail_gunlock_thaw; } error = clean_journal(jd, &head); if (error) - goto fail_gunlock_tr; + goto fail_gunlock_thaw; - gfs2_glock_dq_uninit(&t_gh); + gfs2_glock_dq_uninit(&thaw_gh); t = DIV_ROUND_UP(jiffies - t, HZ); fs_info(sdp, "jid=%u: Journal replayed in %lus\n", jd->jd_jid, t); @@ -572,8 +572,8 @@ void gfs2_recover_func(struct work_struct *work) fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); goto done; -fail_gunlock_tr: - gfs2_glock_dq_uninit(&t_gh); +fail_gunlock_thaw: + gfs2_glock_dq_uninit(&thaw_gh); fail_gunlock_ji: if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); @@ -587,7 +587,7 @@ fail: gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); done: clear_bit(JDF_RECOVERY, &jd->jd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 2226136c764..6142836cce9 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -23,9 +23,9 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh); -extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); -extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); -extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); +extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 69317435faa..f4cb9c0d6bb 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -57,6 +59,11 @@ * 3 = Used (metadata) */ +struct gfs2_extent { + struct gfs2_rbm rbm; + u32 len; +}; + static const char valid_change[16] = { /* current */ /* n */ 0, 1, 1, 1, @@ -65,8 +72,9 @@ static const char valid_change[16] = { 1, 0, 0, 0 }; -static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, - const struct gfs2_inode *ip, bool nowrap); +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap); /** @@ -81,32 +89,32 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, unsigned char new_state) { unsigned char *byte1, *byte2, *end, cur_state; - unsigned int buflen = rbm->bi->bi_len; + struct gfs2_bitmap *bi = rbm_bi(rbm); + unsigned int buflen = bi->bi_len; const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; - byte1 = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY); - end = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + buflen; + byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY); + end = bi->bi_bh->b_data + bi->bi_offset + buflen; BUG_ON(byte1 >= end); cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; if (unlikely(!valid_change[new_state * 4 + cur_state])) { - printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, " - "new_state=%d\n", rbm->offset, cur_state, new_state); - printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n", - (unsigned long long)rbm->rgd->rd_addr, - rbm->bi->bi_start); - printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n", - rbm->bi->bi_offset, rbm->bi->bi_len); + pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n", + rbm->offset, cur_state, new_state); + pr_warn("rgrp=0x%llx bi_start=0x%x\n", + (unsigned long long)rbm->rgd->rd_addr, bi->bi_start); + pr_warn("bi_offset=0x%x bi_len=0x%x\n", + bi->bi_offset, bi->bi_len); dump_stack(); gfs2_consist_rgrpd(rbm->rgd); return; } *byte1 ^= (cur_state ^ new_state) << bit; - if (do_clone && rbm->bi->bi_clone) { - byte2 = rbm->bi->bi_clone + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY); + if (do_clone && bi->bi_clone) { + byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY); cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; *byte2 ^= (cur_state ^ new_state) << bit; } @@ -121,7 +129,8 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm) { - const u8 *buffer = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset; + struct gfs2_bitmap *bi = rbm_bi(rbm); + const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset; const u8 *byte; unsigned int bit; @@ -252,29 +261,53 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) { u64 rblock = block - rbm->rgd->rd_data0; - u32 x; if (WARN_ON_ONCE(rblock > UINT_MAX)) return -EINVAL; if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) return -E2BIG; - rbm->bi = rbm->rgd->rd_bits; + rbm->bii = 0; rbm->offset = (u32)(rblock); /* Check if the block is within the first block */ - if (rbm->offset < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) + if (rbm->offset < rbm_bi(rbm)->bi_blocks) return 0; /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */ rbm->offset += (sizeof(struct gfs2_rgrp) - sizeof(struct gfs2_meta_header)) * GFS2_NBBY; - x = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap; - rbm->offset -= x * rbm->rgd->rd_sbd->sd_blocks_per_bitmap; - rbm->bi += x; + rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap; + rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap; return 0; } /** + * gfs2_rbm_incr - increment an rbm structure + * @rbm: The rbm with rgd already set correctly + * + * This function takes an existing rbm structure and increments it to the next + * viable block offset. + * + * Returns: If incrementing the offset would cause the rbm to go past the + * end of the rgrp, true is returned, otherwise false. + * + */ + +static bool gfs2_rbm_incr(struct gfs2_rbm *rbm) +{ + if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */ + rbm->offset++; + return false; + } + if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */ + return true; + + rbm->offset = 0; + rbm->bii++; + return false; +} + +/** * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned * @rbm: Position to search (value/result) * @n_unaligned: Number of unaligned blocks to check @@ -285,7 +318,6 @@ static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len) { - u64 block; u32 n; u8 res; @@ -296,8 +328,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le (*len)--; if (*len == 0) return true; - block = gfs2_rbm_to_block(rbm); - if (gfs2_rbm_from_block(rbm, block + 1)) + if (gfs2_rbm_incr(rbm)) return true; } @@ -306,7 +337,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le /** * gfs2_free_extlen - Return extent length of free blocks - * @rbm: Starting position + * @rrbm: Starting position * @len: Max length to check * * Starting at the block specified by the rbm, see how many free blocks @@ -328,6 +359,7 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) u32 chunk_size; u8 *ptr, *start, *end; u64 block; + struct gfs2_bitmap *bi; if (n_unaligned && gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len)) @@ -336,11 +368,12 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) n_unaligned = len & 3; /* Start is now byte aligned */ while (len > 3) { - start = rbm.bi->bi_bh->b_data; - if (rbm.bi->bi_clone) - start = rbm.bi->bi_clone; - end = start + rbm.bi->bi_bh->b_size; - start += rbm.bi->bi_offset; + bi = rbm_bi(&rbm); + start = bi->bi_bh->b_data; + if (bi->bi_clone) + start = bi->bi_clone; + end = start + bi->bi_bh->b_size; + start += bi->bi_offset; BUG_ON(rbm.offset & 3); start += (rbm.offset / GFS2_NBBY); bytes = min_t(u32, len / GFS2_NBBY, (end - start)); @@ -605,12 +638,18 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) RB_CLEAR_NODE(&rs->rs_node); if (rs->rs_free) { + struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm); + /* return reserved blocks to the rgrp */ BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; + /* The rgrp extent failure point is likely not to increase; + it will only do so if the freed blocks are somehow + contiguous with a span of free blocks that follows. Still, + it will force the number to be recalculated later. */ + rgd->rd_extfail_pt += rs->rs_free; rs->rs_free = 0; - clear_bit(GBF_FULL, &rs->rs_rbm.bi->bi_flags); - smp_mb__after_clear_bit(); + clear_bit(GBF_FULL, &bi->bi_flags); } } @@ -634,14 +673,13 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) /** * gfs2_rs_delete - delete a multi-block reservation * @ip: The inode for this reservation + * @wcount: The inode's write count, or NULL * */ -void gfs2_rs_delete(struct gfs2_inode *ip) +void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) { - struct inode *inode = &ip->i_inode; - down_write(&ip->i_rw_mutex); - if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) { + if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) { gfs2_rs_deltree(ip->i_res); BUG_ON(ip->i_res->rs_free); kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); @@ -700,11 +738,11 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) { - printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); - printk(KERN_INFO " ri_length = %u\n", rgd->rd_length); - printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); - printk(KERN_INFO " ri_data = %u\n", rgd->rd_data); - printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes); + pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); + pr_info("ri_length = %u\n", rgd->rd_length); + pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); + pr_info("ri_data = %u\n", rgd->rd_data); + pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes); } /** @@ -743,18 +781,21 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) bi->bi_offset = sizeof(struct gfs2_rgrp); bi->bi_start = 0; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; /* header block */ } else if (x == 0) { bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp); bi->bi_offset = sizeof(struct gfs2_rgrp); bi->bi_start = 0; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; /* last block */ } else if (x + 1 == length) { bytes = bytes_left; bi->bi_offset = sizeof(struct gfs2_meta_header); bi->bi_start = rgd->rd_bitbytes - bytes_left; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; /* other blocks */ } else { bytes = sdp->sd_sb.sb_bsize - @@ -762,6 +803,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) bi->bi_offset = sizeof(struct gfs2_meta_header); bi->bi_start = rgd->rd_bitbytes - bytes_left; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; } bytes_left -= bytes; @@ -846,6 +888,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd) static int read_rindex_entry(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + const unsigned bsize = sdp->sd_sb.sb_bsize; loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); struct gfs2_rindex buf; int error; @@ -883,6 +926,8 @@ static int read_rindex_entry(struct gfs2_inode *ip) goto fail; rgd->rd_gl->gl_object = rgd; + rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize; + rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; if (rgd->rd_data > sdp->sd_max_rg_data) @@ -1059,7 +1104,7 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd) * Returns: errno */ -int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_glock *gl = rgd->rd_gl; @@ -1096,8 +1141,10 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; + /* max out the rgrp allocation failure point */ + rgd->rd_extfail_pt = rgd->rd_free; } - if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); @@ -1124,14 +1171,14 @@ fail: return error; } -int update_rgrp_lvb(struct gfs2_rgrpd *rgd) +static int update_rgrp_lvb(struct gfs2_rgrpd *rgd) { u32 rl_flags; if (rgd->rd_flags & GFS2_RDF_UPTODATE) return 0; - if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) return gfs2_rgrp_bh_get(rgd); rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags); @@ -1154,7 +1201,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh) if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) return 0; - return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object); + return gfs2_rgrp_bh_get(rgd); } /** @@ -1392,12 +1439,12 @@ static void rs_insert(struct gfs2_inode *ip) * rg_mblk_search - find a group of multiple free blocks to form a reservation * @rgd: the resource group descriptor * @ip: pointer to the inode for which we're reserving blocks - * @requested: number of blocks required for this allocation + * @ap: the allocation parameters * */ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, - unsigned requested) + const struct gfs2_alloc_parms *ap) { struct gfs2_rbm rbm = { .rgd = rgd, }; u64 goal; @@ -1410,7 +1457,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (S_ISDIR(inode->i_mode)) extlen = 1; else { - extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested); + extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target); extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); } if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) @@ -1425,7 +1472,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) return; - ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true); + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap); if (ret == 0) { rs->rs_rbm = rbm; rs->rs_free = extlen; @@ -1490,6 +1537,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, * @rbm: The current position in the resource group * @ip: The inode for which we are searching for blocks * @minext: The minimum extent length + * @maxext: A pointer to the maximum extent structure * * This checks the current position in the rgrp to see whether there is * a reservation covering this block. If not then this function is a @@ -1502,7 +1550,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, const struct gfs2_inode *ip, - u32 minext) + u32 minext, + struct gfs2_extent *maxext) { u64 block = gfs2_rbm_to_block(rbm); u32 extlen = 1; @@ -1515,8 +1564,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, */ if (minext) { extlen = gfs2_free_extlen(rbm, minext); - nblock = block + extlen; - if (extlen < minext) + if (extlen <= maxext->len) goto fail; } @@ -1525,9 +1573,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, * and skip if parts of it are already reserved */ nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); - if (nblock == block) - return 0; + if (nblock == block) { + if (!minext || extlen >= minext) + return 0; + + if (extlen > maxext->len) { + maxext->len = extlen; + maxext->rbm = *rbm; + } fail: + nblock = block + extlen; + } ret = gfs2_rbm_from_block(rbm, nblock); if (ret < 0) return ret; @@ -1538,30 +1594,38 @@ fail: * gfs2_rbm_find - Look for blocks of a particular state * @rbm: Value/result starting position and final position * @state: The state which we want to find - * @minext: The requested extent length (0 for a single block) + * @minext: Pointer to the requested extent length (NULL for a single block) + * This is updated to be the actual reservation size. * @ip: If set, check for reservations * @nowrap: Stop looking at the end of the rgrp, rather than wrapping * around until we've reached the starting point. + * @ap: the allocation parameters * * Side effects: * - If looking for free blocks, we set GBF_FULL on each bitmap which * has no free blocks in it. + * - If looking for free blocks, we set rd_extfail_pt on each rgrp which + * has come up short on a free block search. * * Returns: 0 on success, -ENOSPC if there is no block of the requested state */ -static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, - const struct gfs2_inode *ip, bool nowrap) +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap) { struct buffer_head *bh; - struct gfs2_bitmap *initial_bi; + int initial_bii; u32 initial_offset; + int first_bii = rbm->bii; + u32 first_offset = rbm->offset; u32 offset; u8 *buffer; - int index; int n = 0; int iters = rbm->rgd->rd_length; int ret; + struct gfs2_bitmap *bi; + struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, }; /* If we are not starting at the beginning of a bitmap, then we * need to add one to the bitmap count to ensure that we search @@ -1571,52 +1635,55 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, iters++; while(1) { - if (test_bit(GBF_FULL, &rbm->bi->bi_flags) && + bi = rbm_bi(rbm); + if (test_bit(GBF_FULL, &bi->bi_flags) && (state == GFS2_BLKST_FREE)) goto next_bitmap; - bh = rbm->bi->bi_bh; - buffer = bh->b_data + rbm->bi->bi_offset; + bh = bi->bi_bh; + buffer = bh->b_data + bi->bi_offset; WARN_ON(!buffer_uptodate(bh)); - if (state != GFS2_BLKST_UNLINKED && rbm->bi->bi_clone) - buffer = rbm->bi->bi_clone + rbm->bi->bi_offset; + if (state != GFS2_BLKST_UNLINKED && bi->bi_clone) + buffer = bi->bi_clone + bi->bi_offset; initial_offset = rbm->offset; - offset = gfs2_bitfit(buffer, rbm->bi->bi_len, rbm->offset, state); + offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state); if (offset == BFITNOENT) goto bitmap_full; rbm->offset = offset; if (ip == NULL) return 0; - initial_bi = rbm->bi; - ret = gfs2_reservation_check_and_update(rbm, ip, minext); + initial_bii = rbm->bii; + ret = gfs2_reservation_check_and_update(rbm, ip, + minext ? *minext : 0, + &maxext); if (ret == 0) return 0; if (ret > 0) { - n += (rbm->bi - initial_bi); + n += (rbm->bii - initial_bii); goto next_iter; } if (ret == -E2BIG) { - index = 0; + rbm->bii = 0; rbm->offset = 0; - n += (rbm->bi - initial_bi); + n += (rbm->bii - initial_bii); goto res_covered_end_of_rgrp; } return ret; bitmap_full: /* Mark bitmap as full and fall through */ - if ((state == GFS2_BLKST_FREE) && initial_offset == 0) - set_bit(GBF_FULL, &rbm->bi->bi_flags); + if ((state == GFS2_BLKST_FREE) && initial_offset == 0) { + struct gfs2_bitmap *bi = rbm_bi(rbm); + set_bit(GBF_FULL, &bi->bi_flags); + } next_bitmap: /* Find next bitmap in the rgrp */ rbm->offset = 0; - index = rbm->bi - rbm->rgd->rd_bits; - index++; - if (index == rbm->rgd->rd_length) - index = 0; + rbm->bii++; + if (rbm->bii == rbm->rgd->rd_length) + rbm->bii = 0; res_covered_end_of_rgrp: - rbm->bi = &rbm->rgd->rd_bits[index]; - if ((index == 0) && nowrap) + if ((rbm->bii == 0) && nowrap) break; n++; next_iter: @@ -1624,6 +1691,24 @@ next_iter: break; } + if (minext == NULL || state != GFS2_BLKST_FREE) + return -ENOSPC; + + /* If the extent was too small, and it's smaller than the smallest + to have failed before, remember for future reference that it's + useless to search this rgrp again for this amount or more. */ + if ((first_offset == 0) && (first_bii == 0) && + (*minext < rbm->rgd->rd_extfail_pt)) + rbm->rgd->rd_extfail_pt = *minext; + + /* If the maximum extent we found is big enough to fulfill the + minimum requirements, use it anyway. */ + if (maxext.len) { + *rbm = maxext.rbm; + *minext = maxext.len; + return 0; + } + return -ENOSPC; } @@ -1645,11 +1730,12 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip struct gfs2_inode *ip; int error; int found = 0; - struct gfs2_rbm rbm = { .rgd = rgd, .bi = rgd->rd_bits, .offset = 0 }; + struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 }; while (1) { down_write(&sdp->sd_log_flush_lock); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, + true, NULL); up_write(&sdp->sd_log_flush_lock); if (error == -ENOSPC) break; @@ -1800,12 +1886,12 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b /** * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for - * @requested: the number of blocks to be reserved + * @ap: the allocation parameters * * Returns: errno */ -int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) +int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; @@ -1817,17 +1903,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; - if (gfs2_assert_warn(sdp, requested)) + if (gfs2_assert_warn(sdp, ap->target)) return -EINVAL; if (gfs2_rs_active(rs)) { begin = rs->rs_rbm.rgd; - flags = 0; /* Yoda: Do or do not. There is no try */ } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { rs->rs_rbm.rgd = begin = ip->i_rgd; } else { rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); } - if (S_ISDIR(ip->i_inode.i_mode) && (aflags & GFS2_AF_ORLOV)) + if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) skip = gfs2_orlov_skip(ip); if (rs->rs_rbm.rgd == NULL) return -EBADSLT; @@ -1861,7 +1946,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) } /* Skip unuseable resource groups */ - if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) + if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | + GFS2_RDF_ERROR)) || + (ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) @@ -1869,27 +1956,28 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) /* Get a reservation if we don't already have one */ if (!gfs2_rs_active(rs)) - rg_mblk_search(rs->rs_rbm.rgd, ip, requested); + rg_mblk_search(rs->rs_rbm.rgd, ip, ap); /* Skip rgrps when we can't get a reservation on first pass */ if (!gfs2_rs_active(rs) && (loops < 1)) goto check_rgrp; /* If rgrp has enough free space, use it */ - if (rs->rs_rbm.rgd->rd_free_clone >= requested) { + if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) { ip->i_rgd = rs->rs_rbm.rgd; return 0; } - /* Drop reservation, if we couldn't use reserved rgrp */ - if (gfs2_rs_active(rs)) - gfs2_rs_deltree(rs); check_rgrp: /* Check for unlinked inodes which can be reclaimed */ if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, ip->i_no_addr); skip_rgrp: + /* Drop reservation, if we couldn't use reserved rgrp */ + if (gfs2_rs_active(rs)) + gfs2_rs_deltree(rs); + /* Unlock rgrp if required */ if (!rg_locked) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); @@ -1913,7 +2001,7 @@ next_rgrp: } /* Flushing the log may release space */ if (loops == 2) - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } return -ENOSPC; @@ -1973,14 +2061,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, *n = 1; block = gfs2_rbm_to_block(rbm); - gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh); + gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh); gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); block++; while (*n < elen) { ret = gfs2_rbm_from_block(&pos, block); if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE) break; - gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh); + gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh); gfs2_setbit(&pos, true, GFS2_BLKST_USED); (*n)++; block++; @@ -2001,6 +2089,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, u32 blen, unsigned char new_state) { struct gfs2_rbm rbm; + struct gfs2_bitmap *bi; rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1); if (!rbm.rgd) { @@ -2011,15 +2100,15 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, while (blen--) { gfs2_rbm_from_block(&rbm, bstart); + bi = rbm_bi(&rbm); bstart++; - if (!rbm.bi->bi_clone) { - rbm.bi->bi_clone = kmalloc(rbm.bi->bi_bh->b_size, - GFP_NOFS | __GFP_NOFAIL); - memcpy(rbm.bi->bi_clone + rbm.bi->bi_offset, - rbm.bi->bi_bh->b_data + rbm.bi->bi_offset, - rbm.bi->bi_len); + if (!bi->bi_clone) { + bi->bi_clone = kmalloc(bi->bi_bh->b_size, + GFP_NOFS | __GFP_NOFAIL); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); } - gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh); + gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh); gfs2_setbit(&rbm, false, new_state); } @@ -2033,25 +2122,24 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, * */ -int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) { struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_blkreserv *trs; const struct rb_node *n; if (rgd == NULL) - return 0; - gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n", + return; + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", (unsigned long long)rgd->rd_addr, rgd->rd_flags, rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, - rgd->rd_reserved); + rgd->rd_reserved, rgd->rd_extfail_pt); spin_lock(&rgd->rd_rsspin); for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { trs = rb_entry(n, struct gfs2_blkreserv, rs_node); dump_rs(seq, trs); } spin_unlock(&rgd->rd_rsspin); - return 0; } static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) @@ -2103,6 +2191,35 @@ out: } /** + * gfs2_set_alloc_start - Set starting point for block allocation + * @rbm: The rbm which will be set to the required location + * @ip: The gfs2 inode + * @dinode: Flag to say if allocation includes a new inode + * + * This sets the starting point from the reservation if one is active + * otherwise it falls back to guessing a start point based on the + * inode's goal block or the last allocation point in the rgrp. + */ + +static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, + const struct gfs2_inode *ip, bool dinode) +{ + u64 goal; + + if (gfs2_rs_active(ip->i_res)) { + *rbm = ip->i_res->rs_rbm; + return; + } + + if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal)) + goal = ip->i_goal; + else + goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0; + + gfs2_rbm_from_block(rbm, goal); +} + +/** * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode * @ip: the inode to allocate the block for * @bn: Used to return the starting block number @@ -2120,30 +2237,24 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, struct buffer_head *dibh; struct gfs2_rbm rbm = { .rgd = ip->i_rgd, }; unsigned int ndata; - u64 goal; u64 block; /* block, within the file system scope */ int error; - if (gfs2_rs_active(ip->i_res)) - goal = gfs2_rbm_to_block(&ip->i_res->rs_rbm); - else if (!dinode && rgrp_contains_block(rbm.rgd, ip->i_goal)) - goal = ip->i_goal; - else - goal = rbm.rgd->rd_last_alloc + rbm.rgd->rd_data0; - - gfs2_rbm_from_block(&rbm, goal); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); + gfs2_set_alloc_start(&rbm, ip, dinode); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL); if (error == -ENOSPC) { - gfs2_rbm_from_block(&rbm, goal); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); + gfs2_set_alloc_start(&rbm, ip, dinode); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false, + NULL); } /* Since all blocks are reserved in advance, this shouldn't happen */ if (error) { - fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n", + fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n", (unsigned long long)ip->i_no_addr, error, *nblocks, - test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags)); + test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags), + rbm.rgd->rd_extfail_pt); goto rgrp_error; } @@ -2169,7 +2280,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, } } if (rbm.rgd->rd_free < *nblocks) { - printk(KERN_WARNING "nblocks=%u\n", *nblocks); + pr_warn("nblocks=%u\n", *nblocks); goto rgrp_error; } @@ -2187,7 +2298,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); if (dinode) - gfs2_trans_add_unrevoke(sdp, block, 1); + gfs2_trans_add_unrevoke(sdp, block, *nblocks); gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); @@ -2411,7 +2522,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) /** * gfs2_rlist_free - free a resource group list - * @list: the list of resource groups + * @rlist: the list of resource groups * */ diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 5b3f4a896e6..463ab2e95d1 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -40,7 +40,7 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); #define GFS2_AF_ORLOV 1 -extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 flags); +extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap); extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, @@ -48,7 +48,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, extern int gfs2_rs_alloc(struct gfs2_inode *ip); extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rs_delete(struct gfs2_inode *ip); +extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); @@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e5639dec66c..1319b5c4ec6 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/bio.h> #include <linux/sched.h> #include <linux/slab.h> @@ -175,8 +177,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) break; case Opt_debug: if (args->ar_errors == GFS2_ERRORS_PANIC) { - printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " - "are mutually exclusive.\n"); + pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); return -EINVAL; } args->ar_debug = 1; @@ -228,21 +229,21 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) case Opt_commit: rv = match_int(&tmp[0], &args->ar_commit); if (rv || args->ar_commit <= 0) { - printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n"); + pr_warn("commit mount option requires a positive numeric argument\n"); return rv ? rv : -EINVAL; } break; case Opt_statfs_quantum: rv = match_int(&tmp[0], &args->ar_statfs_quantum); if (rv || args->ar_statfs_quantum < 0) { - printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n"); + pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n"); return rv ? rv : -EINVAL; } break; case Opt_quota_quantum: rv = match_int(&tmp[0], &args->ar_quota_quantum); if (rv || args->ar_quota_quantum <= 0) { - printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n"); + pr_warn("quota_quantum mount option requires a positive numeric argument\n"); return rv ? rv : -EINVAL; } break; @@ -250,7 +251,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) rv = match_int(&tmp[0], &args->ar_statfs_percent); if (rv || args->ar_statfs_percent < 0 || args->ar_statfs_percent > 100) { - printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n"); + pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n"); return rv ? rv : -EINVAL; } break; @@ -259,8 +260,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) break; case Opt_err_panic: if (args->ar_debug) { - printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " - "are mutually exclusive.\n"); + pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); return -EINVAL; } args->ar_errors = GFS2_ERRORS_PANIC; @@ -279,7 +279,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) break; case Opt_error: default: - printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o); + pr_warn("invalid mount option: %s\n", o); return -EINVAL; } } @@ -295,9 +295,8 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) void gfs2_jindex_free(struct gfs2_sbd *sdp) { - struct list_head list, *head; + struct list_head list; struct gfs2_jdesc *jd; - struct gfs2_journal_extent *jext; spin_lock(&sdp->sd_jindex_spin); list_add(&list, &sdp->sd_jindex_list); @@ -307,14 +306,7 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp) while (!list_empty(&list)) { jd = list_entry(list.next, struct gfs2_jdesc, jd_list); - head = &jd->extent_list; - while (!list_empty(head)) { - jext = list_entry(head->next, - struct gfs2_journal_extent, - extent_list); - list_del(&jext->extent_list); - kfree(jext); - } + gfs2_free_journal_extents(jd); list_del(&jd->jd_list); iput(jd->jd_inode); kfree(jd); @@ -369,6 +361,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) return 0; } +static int init_threads(struct gfs2_sbd *sdp) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + return 0; + +fail: + kthread_stop(sdp->sd_logd_process); + return error; +} + /** * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one * @sdp: the filesystem @@ -380,14 +399,19 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); struct gfs2_glock *j_gl = ip->i_gl; - struct gfs2_holder t_gh; + struct gfs2_holder thaw_gh; struct gfs2_log_header_host head; int error; - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + error = init_threads(sdp); if (error) return error; + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, + &thaw_gh); + if (error) + goto fail_threads; + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head); @@ -410,14 +434,16 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - gfs2_glock_dq_uninit(&t_gh); + gfs2_glock_dq_uninit(&thaw_gh); return 0; fail: - t_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&t_gh); - + thaw_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&thaw_gh); +fail_threads: + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); return error; } @@ -610,15 +636,21 @@ struct lfcc { */ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, - struct gfs2_holder *t_gh) + struct gfs2_holder *freeze_gh) { struct gfs2_inode *ip; struct gfs2_jdesc *jd; struct lfcc *lfcc; LIST_HEAD(list); struct gfs2_log_header_host lh; + struct gfs2_inode *dip = GFS2_I(sdp->sd_root_dir->d_inode); int error; + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, + &sdp->sd_freeze_root_gh); + if (error) + return error; + atomic_set(&sdp->sd_frozen_root, 1); list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); if (!lfcc) { @@ -634,8 +666,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, list_add(&lfcc->list, &list); } - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, - GL_NOCACHE, t_gh); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, freeze_gh); list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { error = gfs2_jdesc_check(jd); @@ -651,7 +683,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, } if (error) - gfs2_glock_dq_uninit(t_gh); + gfs2_glock_dq_uninit(freeze_gh); out: while (!list_empty(&list)) { @@ -660,6 +692,11 @@ out: gfs2_glock_dq_uninit(&lfcc->gh); kfree(lfcc); } + if (error) { + atomic_dec(&sdp->sd_frozen_root); + wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0); + gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh); + } return error; } @@ -717,7 +754,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) int ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) - gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); if (bdi->dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else @@ -797,25 +834,30 @@ out: static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) { - struct gfs2_holder t_gh; + struct gfs2_holder thaw_gh; int error; - flush_workqueue(gfs2_delete_workqueue); - gfs2_quota_sync(sdp->sd_vfs, 0); - gfs2_statfs_sync(sdp->sd_vfs, 0); - - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, - &t_gh); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE, + &thaw_gh); if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) return error; - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); - + down_write(&sdp->sd_log_flush_lock); clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + up_write(&sdp->sd_log_flush_lock); - if (t_gh.gh_gl) - gfs2_glock_dq_uninit(&t_gh); + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + + flush_workqueue(gfs2_delete_workqueue); + gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_statfs_sync(sdp->sd_vfs, 0); + + gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH); + gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); + + if (thaw_gh.gh_gl) + gfs2_glock_dq_uninit(&thaw_gh); gfs2_quota_cleanup(sdp); @@ -857,9 +899,6 @@ restart: } spin_unlock(&sdp->sd_jindex_spin); - kthread_stop(sdp->sd_quotad_process); - kthread_stop(sdp->sd_logd_process); - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_ro(sdp); if (error) @@ -875,7 +914,7 @@ restart: iput(sdp->sd_quota_inode); gfs2_glock_put(sdp->sd_rename_gl); - gfs2_glock_put(sdp->sd_trans_gl); + gfs2_glock_put(sdp->sd_freeze_gl); if (!sdp->sd_args.ar_spectator) { gfs2_glock_dq_uninit(&sdp->sd_journal_gh); @@ -910,8 +949,8 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) struct gfs2_sbd *sdp = sb->s_fs_info; gfs2_quota_sync(sb, -1); - if (wait && sdp) - gfs2_log_flush(sdp, NULL); + if (wait && sdp && !atomic_read(&sdp->sd_log_freeze)) + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); return 0; } @@ -961,6 +1000,9 @@ static int gfs2_unfreeze(struct super_block *sb) struct gfs2_sbd *sdp = sb->s_fs_info; gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + atomic_dec(&sdp->sd_frozen_root); + wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0); + gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh); return 0; } @@ -1142,6 +1184,8 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) struct gfs2_tune *gt = &sdp->sd_tune; int error; + sync_filesystem(sb); + spin_lock(>->gt_spin); args.ar_commit = gt->gt_logd_secs; args.ar_quota_quantum = gt->gt_quota_quantum; @@ -1223,7 +1267,7 @@ static int gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); - if (inode->i_nlink) { + if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) clear_nlink(inode); @@ -1438,6 +1482,11 @@ static void gfs2_evict_inode(struct inode *inode) struct gfs2_holder gh; int error; + if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { + clear_inode(inode); + return; + } + if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) goto out; @@ -1493,7 +1542,7 @@ static void gfs2_evict_inode(struct inode *inode) goto out_unlock; out_truncate: - gfs2_log_flush(sdp, ip->i_gl); + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) { struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); filemap_fdatawrite(metamapping); @@ -1525,8 +1574,8 @@ out_unlock: fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: /* Case 3 starts here */ - truncate_inode_pages(&inode->i_data, 0); - gfs2_rs_delete(ip); + truncate_inode_pages_final(&inode->i_data); + gfs2_rs_delete(ip, NULL); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index aa5c4804496..3ab566ba569 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -138,9 +140,8 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: withdrawing from cluster at user's request\n", - sdp->sd_fsname); + gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n"); + return len; } @@ -239,8 +240,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len if (gltype > LM_TYPE_JOURNAL) return -EINVAL; - if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK) - glops = &gfs2_trans_glops; + if (gltype == LM_TYPE_NONDISK && glnum == GFS2_FREEZE_LOCK) + glops = &gfs2_freeze_glops; else glops = gfs2_glops_list[gltype]; if (glops == NULL) @@ -332,7 +333,7 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); else if (val == 0) { clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); gfs2_glock_thaw(sdp); } else { ret = -EINVAL; @@ -406,6 +407,9 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) struct gfs2_jdesc *jd; int rv; + /* Wait for our primary journal to be initialized */ + wait_for_completion(&sdp->sd_journal_ready); + spin_lock(&sdp->sd_jindex_spin); rv = -EBUSY; if (sdp->sd_jdesc->jd_jid == jid) @@ -481,7 +485,7 @@ static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) rv = jid = -EINVAL; sdp->sd_lockstruct.ls_jid = jid; clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); out: spin_unlock(&sdp->sd_jindex_spin); @@ -587,7 +591,6 @@ TUNE_ATTR(max_readahead, 0); TUNE_ATTR(complain_secs, 0); TUNE_ATTR(statfs_slow, 0); TUNE_ATTR(new_files_jdata, 0); -TUNE_ATTR(quota_simul_sync, 1); TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); @@ -597,7 +600,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_max_readahead.attr, &tune_attr_complain_secs.attr, &tune_attr_statfs_slow.attr, - &tune_attr_quota_simul_sync.attr, &tune_attr_statfs_quantum.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 2b20d7046bf..0546ab4e28e 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -46,64 +48,41 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, tr->tr_blocks = blocks; tr->tr_revokes = revokes; tr->tr_reserved = 1; + tr->tr_alloced = 1; if (blocks) tr->tr_reserved += 6 + blocks; if (revokes) tr->tr_reserved += gfs2_struct2blk(sdp, revokes, sizeof(u64)); - sb_start_intwrite(sdp->sd_vfs); - gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); + INIT_LIST_HEAD(&tr->tr_databuf); + INIT_LIST_HEAD(&tr->tr_buf); - error = gfs2_glock_nq(&tr->tr_t_gh); - if (error) - goto fail_holder_uninit; + sb_start_intwrite(sdp->sd_vfs); error = gfs2_log_reserve(sdp, tr->tr_reserved); if (error) - goto fail_gunlock; + goto fail; current->journal_info = tr; return 0; -fail_gunlock: - gfs2_glock_dq(&tr->tr_t_gh); - -fail_holder_uninit: +fail: sb_end_intwrite(sdp->sd_vfs); - gfs2_holder_uninit(&tr->tr_t_gh); kfree(tr); return error; } -/** - * gfs2_log_release - Release a given number of log blocks - * @sdp: The GFS2 superblock - * @blks: The number of blocks - * - */ - -static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) -{ - - atomic_add(blks, &sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, blks); - gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= - sdp->sd_jdesc->jd_blocks); - up_read(&sdp->sd_log_flush_lock); -} - static void gfs2_print_trans(const struct gfs2_trans *tr) { - printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n", - (void *)tr->tr_ip); - printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n", - tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); - printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n", - tr->tr_num_buf_new, tr->tr_num_buf_rm, - tr->tr_num_databuf_new, tr->tr_num_databuf_rm, - tr->tr_num_revoke, tr->tr_num_revoke_rm); + pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip); + pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n", + tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); + pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n", + tr->tr_num_buf_new, tr->tr_num_buf_rm, + tr->tr_num_databuf_new, tr->tr_num_databuf_rm, + tr->tr_num_revoke, tr->tr_num_revoke_rm); } void gfs2_trans_end(struct gfs2_sbd *sdp) @@ -115,11 +94,8 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (!tr->tr_touched) { gfs2_log_release(sdp, tr->tr_reserved); - if (tr->tr_t_gh.gh_gl) { - gfs2_glock_dq(&tr->tr_t_gh); - gfs2_holder_uninit(&tr->tr_t_gh); + if (tr->tr_alloced) kfree(tr); - } sb_end_intwrite(sdp->sd_vfs); return; } @@ -133,16 +109,12 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) gfs2_print_trans(tr); gfs2_log_commit(sdp, tr); - if (tr->tr_t_gh.gh_gl) { - gfs2_glock_dq(&tr->tr_t_gh); - gfs2_holder_uninit(&tr->tr_t_gh); - if (!tr->tr_attached) + if (tr->tr_alloced && !tr->tr_attached) kfree(tr); - } up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); sb_end_intwrite(sdp->sd_vfs); } @@ -210,8 +182,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); gfs2_pin(sdp, bd->bd_bh); tr->tr_num_databuf_new++; - sdp->sd_log_num_databuf++; - list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); + list_add_tail(&bd->bd_list, &tr->tr_databuf); } gfs2_log_unlock(sdp); unlock_buffer(bh); @@ -230,16 +201,14 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { - printk(KERN_ERR - "Attempting to add uninitialised block to journal (inplace block=%lld)\n", + pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n", (unsigned long long)bd->bd_bh->b_blocknr); BUG(); } gfs2_pin(sdp, bd->bd_bh); mh->__pad0 = cpu_to_be64(0); mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); - sdp->sd_log_num_buf++; - list_add(&bd->bd_list, &sdp->sd_log_le_buf); + list_add(&bd->bd_list, &tr->tr_buf); tr->tr_num_buf_new++; } diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 6402fb69d71..86d2035ac66 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> @@ -30,22 +32,27 @@ mempool_t *gfs2_page_pool __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) { - printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n", - sdp->sd_fsname); + fs_emerg(sdp, "fatal assertion failed\n"); } -int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; const struct lm_lockops *lm = ls->ls_ops; va_list args; + struct va_format vaf; if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) return 0; va_start(args, fmt); - vprintk(fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + fs_err(sdp, "%pV", &vaf); + va_end(args); if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { @@ -66,7 +73,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) } if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) - panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); + panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname); return -1; } @@ -82,10 +89,9 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, { int me; me = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, assertion, - sdp->sd_fsname, function, file, line); + "fatal: assertion \"%s\" failed\n" + " function = %s, file = %s, line = %u\n", + assertion, function, file, line); dump_stack(); return (me) ? -1 : -2; } @@ -105,11 +111,8 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, return -2; if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) - printk(KERN_WARNING - "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, assertion, - sdp->sd_fsname, function, file, line); + fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n", + assertion, function, file, line); if (sdp->sd_args.ar_debug) BUG(); @@ -138,10 +141,8 @@ int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, { int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: filesystem consistency error\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, function, file, line); + "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n", + function, file, line); return rv; } @@ -157,13 +158,12 @@ int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: filesystem consistency error\n" - "GFS2: fsid=%s: inode = %llu %llu\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino, - (unsigned long long)ip->i_no_addr, - sdp->sd_fsname, function, file, line); + "fatal: filesystem consistency error\n" + " inode = %llu %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + function, file, line); return rv; } @@ -179,12 +179,11 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, struct gfs2_sbd *sdp = rgd->rd_sbd; int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: filesystem consistency error\n" - "GFS2: fsid=%s: RG = %llu\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)rgd->rd_addr, - sdp->sd_fsname, function, file, line); + "fatal: filesystem consistency error\n" + " RG = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)rgd->rd_addr, + function, file, line); return rv; } @@ -200,12 +199,11 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, { int me; me = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: invalid metadata block\n" - "GFS2: fsid=%s: bh = %llu (%s)\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, - sdp->sd_fsname, function, file, line); + "fatal: invalid metadata block\n" + " bh = %llu (%s)\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, type, + function, file, line); return (me) ? -1 : -2; } @@ -221,12 +219,11 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, { int me; me = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: invalid metadata block\n" - "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t, - sdp->sd_fsname, function, file, line); + "fatal: invalid metadata block\n" + " bh = %llu (type: exp=%u, found=%u)\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, type, t, + function, file, line); return (me) ? -1 : -2; } @@ -241,10 +238,9 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, { int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: I/O error\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, function, file, line); + "fatal: I/O error\n" + " function = %s, file = %s, line = %u\n", + function, file, line); return rv; } @@ -259,32 +255,11 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, { int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: I/O error\n" - "GFS2: fsid=%s: block = %llu\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)bh->b_blocknr, - sdp->sd_fsname, function, file, line); + "fatal: I/O error\n" + " block = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, + function, file, line); return rv; } -void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, - unsigned int bit, int new_value) -{ - unsigned int c, o, b = bit; - int old_value; - - c = b / (8 * PAGE_SIZE); - b %= 8 * PAGE_SIZE; - o = b / 8; - b %= 8; - - old_value = (bitmap[c][o] & (1 << b)); - gfs2_assert_withdraw(sdp, !old_value != !new_value); - - if (new_value) - bitmap[c][o] |= 1 << b; - else - bitmap[c][o] &= ~(1 << b); -} - diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 80535739ac7..cbdcbdf3961 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -10,22 +10,23 @@ #ifndef __UTIL_DOT_H__ #define __UTIL_DOT_H__ +#ifdef pr_fmt +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#endif + #include <linux/mempool.h> #include "incore.h" -#define fs_printk(level, fs, fmt, arg...) \ - printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg) - -#define fs_info(fs, fmt, arg...) \ - fs_printk(KERN_INFO , fs , fmt , ## arg) - -#define fs_warn(fs, fmt, arg...) \ - fs_printk(KERN_WARNING , fs , fmt , ## arg) - -#define fs_err(fs, fmt, arg...) \ - fs_printk(KERN_ERR, fs , fmt , ## arg) - +#define fs_emerg(fs, fmt, ...) \ + pr_emerg("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_warn(fs, fmt, ...) \ + pr_warn("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_err(fs, fmt, ...) \ + pr_err("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_info(fs, fmt, ...) \ + pr_info("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) void gfs2_assert_i(struct gfs2_sbd *sdp); @@ -85,7 +86,7 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp, struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; u32 magic = be32_to_cpu(mh->mh_magic); if (unlikely(magic != GFS2_MAGIC)) { - printk(KERN_ERR "GFS2: Magic number missing at %llu\n", + pr_err("Magic number missing at %llu\n", (unsigned long long)bh->b_blocknr); return -EIO; } @@ -164,9 +165,7 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) -void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, - unsigned int bit, int new_value); -int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...); +__printf(2, 3) +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...); #endif /* __UTIL_DOT_H__ */ - diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index ecd37f30ab9..0b81f783f78 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -13,6 +13,7 @@ #include <linux/buffer_head.h> #include <linux/xattr.h> #include <linux/gfs2_ondisk.h> +#include <linux/posix_acl_xattr.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -723,6 +724,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, unsigned int blks, ea_skeleton_call_t skeleton_call, void *private) { + struct gfs2_alloc_parms ap = { .target = blks }; struct buffer_head *dibh; int error; @@ -734,7 +736,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) return error; - error = gfs2_inplace_reserve(ip, blks, 0); + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_gunlock_q; @@ -1499,7 +1501,8 @@ static const struct xattr_handler gfs2_xattr_security_handler = { const struct xattr_handler *gfs2_xattr_handlers[] = { &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, - &gfs2_xattr_system_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, NULL, }; diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index 2a1d712f85d..f6bd266d70b 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h @@ -153,11 +153,6 @@ struct hfs_btree_header_rec { u32 reserved3[16]; } __packed; -#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ -#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */ -#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ -#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ - #define BTREE_ATTR_BADCLOSE 0x00000001 /* b-tree not closed properly. not used by hfsplus. */ #define HFS_TREE_BIGKEYS 0x00000002 /* key length is u16 instead of u8. diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 380ab31b5e0..d0929bc8178 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -125,15 +125,15 @@ static int hfs_releasepage(struct page *page, gfp_t mask) } static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file)->i_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - hfs_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, hfs_get_block); /* * In case of error extending write may have instantiated a few @@ -141,7 +141,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) hfs_write_failed(mapping, end); @@ -547,7 +547,7 @@ out: void hfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; @@ -674,10 +674,10 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, static const struct file_operations hfs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, .fsync = hfs_file_fsync, diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 2d2039e754c..eee7206c38d 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -112,6 +112,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h index 07c0d494752..95c8ed9ec17 100644 --- a/fs/hfsplus/acl.h +++ b/fs/hfsplus/acl.h @@ -12,16 +12,13 @@ /* posix_acl.c */ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); -extern int hfsplus_posix_acl_chmod(struct inode *); +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type); extern int hfsplus_init_posix_acl(struct inode *, struct inode *); #else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ #define hfsplus_get_posix_acl NULL - -static inline int hfsplus_posix_acl_chmod(struct inode *inode) -{ - return 0; -} +#define hfsplus_set_posix_acl NULL static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) { diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index 0f47890299c..e5b221de7de 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -11,7 +11,7 @@ static struct kmem_cache *hfsplus_attr_tree_cachep; -int hfsplus_create_attr_tree_cache(void) +int __init hfsplus_create_attr_tree_cache(void) { if (hfsplus_attr_tree_cachep) return -EEXIST; @@ -54,14 +54,11 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, memset(key, 0, sizeof(struct hfsplus_attr_key)); key->attr.cnid = cpu_to_be32(cnid); if (name) { - len = strlen(name); - if (len > HFSPLUS_ATTR_MAX_STRLEN) { - pr_err("invalid xattr name's length\n"); - return -EINVAL; - } - hfsplus_asc2uni(sb, + int res = hfsplus_asc2uni(sb, (struct hfsplus_unistr *)&key->attr.key_name, - HFSPLUS_ATTR_MAX_STRLEN, name, len); + HFSPLUS_ATTR_MAX_STRLEN, name, strlen(name)); + if (res) + return res; len = be16_to_cpu(key->attr.key_name.length); } else { key->attr.key_name.length = 0; @@ -82,31 +79,6 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, return 0; } -void hfsplus_attr_build_key_uni(hfsplus_btree_key *key, - u32 cnid, - struct hfsplus_attr_unistr *name) -{ - int ustrlen; - - memset(key, 0, sizeof(struct hfsplus_attr_key)); - ustrlen = be16_to_cpu(name->length); - key->attr.cnid = cpu_to_be32(cnid); - key->attr.key_name.length = cpu_to_be16(ustrlen); - ustrlen *= 2; - memcpy(key->attr.key_name.unicode, name->unicode, ustrlen); - - /* The length of the key, as stored in key_len field, does not include - * the size of the key_len field itself. - * So, offsetof(hfsplus_attr_key, key_name) is a trick because - * it takes into consideration key_len field (__be16) of - * hfsplus_attr_key structure instead of length field (__be16) of - * hfsplus_attr_unistr structure. - */ - key->key_len = - cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) + - ustrlen); -} - hfsplus_attr_entry *hfsplus_alloc_attr_entry(void) { return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 11c86020452..759708fd933 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -27,13 +27,13 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) pagep = node->page + (off >> PAGE_CACHE_SHIFT); off &= ~PAGE_CACHE_MASK; - l = min(len, (int)PAGE_CACHE_SIZE - off); + l = min_t(int, len, PAGE_CACHE_SIZE - off); memcpy(buf, kmap(*pagep) + off, l); kunmap(*pagep); while ((len -= l) != 0) { buf += l; - l = min(len, (int)PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_CACHE_SIZE); memcpy(buf, kmap(*++pagep), l); kunmap(*pagep); } @@ -80,14 +80,14 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) pagep = node->page + (off >> PAGE_CACHE_SHIFT); off &= ~PAGE_CACHE_MASK; - l = min(len, (int)PAGE_CACHE_SIZE - off); + l = min_t(int, len, PAGE_CACHE_SIZE - off); memcpy(kmap(*pagep) + off, buf, l); set_page_dirty(*pagep); kunmap(*pagep); while ((len -= l) != 0) { buf += l; - l = min(len, (int)PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_CACHE_SIZE); memcpy(kmap(*++pagep), buf, l); set_page_dirty(*pagep); kunmap(*pagep); @@ -110,13 +110,13 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) pagep = node->page + (off >> PAGE_CACHE_SHIFT); off &= ~PAGE_CACHE_MASK; - l = min(len, (int)PAGE_CACHE_SIZE - off); + l = min_t(int, len, PAGE_CACHE_SIZE - off); memset(kmap(*pagep) + off, 0, l); set_page_dirty(*pagep); kunmap(*pagep); while ((len -= l) != 0) { - l = min(len, (int)PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_CACHE_SIZE); memset(kmap(*++pagep), 0, l); set_page_dirty(*pagep); kunmap(*pagep); @@ -142,14 +142,14 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, dst &= ~PAGE_CACHE_MASK; if (src == dst) { - l = min(len, (int)PAGE_CACHE_SIZE - src); + l = min_t(int, len, PAGE_CACHE_SIZE - src); memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); kunmap(*src_page); set_page_dirty(*dst_page); kunmap(*dst_page); while ((len -= l) != 0) { - l = min(len, (int)PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_CACHE_SIZE); memcpy(kmap(*++dst_page), kmap(*++src_page), l); kunmap(*src_page); set_page_dirty(*dst_page); @@ -251,7 +251,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) dst &= ~PAGE_CACHE_MASK; if (src == dst) { - l = min(len, (int)PAGE_CACHE_SIZE - src); + l = min_t(int, len, PAGE_CACHE_SIZE - src); memmove(kmap(*dst_page) + src, kmap(*src_page) + src, l); kunmap(*src_page); @@ -259,7 +259,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) kunmap(*dst_page); while ((len -= l) != 0) { - l = min(len, (int)PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_CACHE_SIZE); memmove(kmap(*++dst_page), kmap(*++src_page), l); kunmap(*src_page); @@ -386,9 +386,8 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) struct hfs_bnode *node; if (cnid >= tree->node_count) { - pr_err("request for non-existent node " - "%d in B*Tree\n", - cnid); + pr_err("request for non-existent node %d in B*Tree\n", + cnid); return NULL; } @@ -409,9 +408,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) loff_t off; if (cnid >= tree->node_count) { - pr_err("request for non-existent node " - "%d in B*Tree\n", - cnid); + pr_err("request for non-existent node %d in B*Tree\n", + cnid); return NULL; } @@ -602,7 +600,7 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) pagep = node->page; memset(kmap(*pagep) + node->page_offset, 0, - min((int)PAGE_CACHE_SIZE, (int)tree->node_size)); + min_t(int, PAGE_CACHE_SIZE, tree->node_size)); set_page_dirty(*pagep); kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { @@ -648,8 +646,8 @@ void hfs_bnode_put(struct hfs_bnode *node) if (test_bit(HFS_BNODE_DELETED, &node->flags)) { hfs_bnode_unhash(node); spin_unlock(&tree->hash_lock); - hfs_bnode_clear(node, 0, - PAGE_CACHE_SIZE * tree->pages_per_bnode); + if (hfs_bnode_need_zeroout(tree)) + hfs_bnode_clear(node, 0, tree->node_size); hfs_bmap_free(node); hfs_bnode_free(node); return; @@ -658,3 +656,16 @@ void hfs_bnode_put(struct hfs_bnode *node) } } +/* + * Unused nodes have to be zeroed if this is the catalog tree and + * a corresponding flag in the volume header is set. + */ +bool hfs_bnode_need_zeroout(struct hfs_btree *tree) +{ + struct super_block *sb = tree->inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + const u32 volume_attr = be32_to_cpu(sbi->s_vhdr->attributes); + + return tree->cnid == HFSPLUS_CAT_CNID && + volume_attr & HFSPLUS_VOL_UNUSED_NODE_FIX; +} diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 0c6540c9116..3345c7553ed 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -15,6 +15,118 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" +/* + * Initial source code of clump size calculation is gotten + * from http://opensource.apple.com/tarballs/diskdev_cmds/ + */ +#define CLUMP_ENTRIES 15 + +static short clumptbl[CLUMP_ENTRIES * 3] = { +/* + * Volume Attributes Catalog Extents + * Size Clump (MB) Clump (MB) Clump (MB) + */ + /* 1GB */ 4, 4, 4, + /* 2GB */ 6, 6, 4, + /* 4GB */ 8, 8, 4, + /* 8GB */ 11, 11, 5, + /* + * For volumes 16GB and larger, we want to make sure that a full OS + * install won't require fragmentation of the Catalog or Attributes + * B-trees. We do this by making the clump sizes sufficiently large, + * and by leaving a gap after the B-trees for them to grow into. + * + * For SnowLeopard 10A298, a FullNetInstall with all packages selected + * results in: + * Catalog B-tree Header + * nodeSize: 8192 + * totalNodes: 31616 + * freeNodes: 1978 + * (used = 231.55 MB) + * Attributes B-tree Header + * nodeSize: 8192 + * totalNodes: 63232 + * freeNodes: 958 + * (used = 486.52 MB) + * + * We also want Time Machine backup volumes to have a sufficiently + * large clump size to reduce fragmentation. + * + * The series of numbers for Catalog and Attribute form a geometric + * series. For Catalog (16GB to 512GB), each term is 8**(1/5) times + * the previous term. For Attributes (16GB to 512GB), each term is + * 4**(1/5) times the previous term. For 1TB to 16TB, each term is + * 2**(1/5) times the previous term. + */ + /* 16GB */ 64, 32, 5, + /* 32GB */ 84, 49, 6, + /* 64GB */ 111, 74, 7, + /* 128GB */ 147, 111, 8, + /* 256GB */ 194, 169, 9, + /* 512GB */ 256, 256, 11, + /* 1TB */ 294, 294, 14, + /* 2TB */ 338, 338, 16, + /* 4TB */ 388, 388, 20, + /* 8TB */ 446, 446, 25, + /* 16TB */ 512, 512, 32 +}; + +u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, + u64 sectors, int file_id) +{ + u32 mod = max(node_size, block_size); + u32 clump_size; + int column; + int i; + + /* Figure out which column of the above table to use for this file. */ + switch (file_id) { + case HFSPLUS_ATTR_CNID: + column = 0; + break; + case HFSPLUS_CAT_CNID: + column = 1; + break; + default: + column = 2; + break; + } + + /* + * The default clump size is 0.8% of the volume size. And + * it must also be a multiple of the node and block size. + */ + if (sectors < 0x200000) { + clump_size = sectors << 2; /* 0.8 % */ + if (clump_size < (8 * node_size)) + clump_size = 8 * node_size; + } else { + /* turn exponent into table index... */ + for (i = 0, sectors = sectors >> 22; + sectors && (i < CLUMP_ENTRIES - 1); + ++i, sectors = sectors >> 1) { + /* empty body */ + } + + clump_size = clumptbl[column + (i) * 3] * 1024 * 1024; + } + + /* + * Round the clump size to a multiple of node and block size. + * NOTE: This rounds down. + */ + clump_size /= mod; + clump_size *= mod; + + /* + * Rounding down could have rounded down to 0 if the block size was + * greater than the clump size. If so, just use one block or node. + */ + if (clump_size == 0) + clump_size = mod; + + return clump_size; +} /* Get a reference to a B*Tree and do some initial checks */ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) @@ -246,7 +358,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) u32 count; int res; - res = hfsplus_file_extend(inode); + res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree)); if (res) return ERR_PTR(res); hip->phys_size = inode->i_size = diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 968ce411db5..32602c667b4 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -103,6 +103,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, folder = &entry->folder; memset(folder, 0, sizeof(*folder)); folder->type = cpu_to_be16(HFSPLUS_FOLDER); + if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) + folder->flags |= cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT); folder->id = cpu_to_be32(inode->i_ino); HFSPLUS_I(inode)->create_date = folder->create_date = @@ -203,6 +205,36 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, return hfs_brec_find(fd, hfs_find_rec_by_key); } +static void hfsplus_subfolders_inc(struct inode *dir) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + + if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) { + /* + * Increment subfolder count. Note, the value is only meaningful + * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set. + */ + HFSPLUS_I(dir)->subfolders++; + } +} + +static void hfsplus_subfolders_dec(struct inode *dir) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + + if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) { + /* + * Decrement subfolder count. Note, the value is only meaningful + * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set. + * + * Check for zero. Some subfolders may have been created + * by an implementation ignorant of this counter. + */ + if (HFSPLUS_I(dir)->subfolders) + HFSPLUS_I(dir)->subfolders--; + } +} + int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) { @@ -247,6 +279,8 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, goto err1; dir->i_size++; + if (S_ISDIR(inode->i_mode)) + hfsplus_subfolders_inc(dir); dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); @@ -336,6 +370,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) goto out; dir->i_size--; + if (type == HFSPLUS_FOLDER) + hfsplus_subfolders_dec(dir); dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); @@ -380,6 +416,7 @@ int hfsplus_rename_cat(u32 cnid, hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, src_fd.entrylength); + type = be16_to_cpu(entry.type); /* create new dir entry with the data from the old entry */ hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); @@ -394,6 +431,8 @@ int hfsplus_rename_cat(u32 cnid, if (err) goto out; dst_dir->i_size++; + if (type == HFSPLUS_FOLDER) + hfsplus_subfolders_inc(dst_dir); dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; /* finally remove the old entry */ @@ -405,6 +444,8 @@ int hfsplus_rename_cat(u32 cnid, if (err) goto out; src_dir->i_size--; + if (type == HFSPLUS_FOLDER) + hfsplus_subfolders_dec(src_dir); src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; /* remove old thread entry */ diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 4a4fea00267..610a3260bef 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -12,6 +12,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/random.h> +#include <linux/nls.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" @@ -127,7 +128,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; int len, err; - char strbuf[HFSPLUS_MAX_STRLEN + 1]; + char *strbuf; hfsplus_cat_entry entry; struct hfs_find_data fd; struct hfsplus_readdir_data *rd; @@ -139,6 +140,11 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return err; + strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN + 1, GFP_KERNEL); + if (!strbuf) { + err = -ENOMEM; + goto out; + } hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) @@ -193,7 +199,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); type = be16_to_cpu(entry.type); - len = HFSPLUS_MAX_STRLEN; + len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN; err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); if (err) goto out; @@ -212,13 +218,31 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) be32_to_cpu(entry.folder.id), DT_DIR)) break; } else if (type == HFSPLUS_FILE) { + u16 mode; + unsigned type = DT_UNKNOWN; + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { pr_err("small file entry\n"); err = -EIO; goto out; } + + mode = be16_to_cpu(entry.file.permissions.mode); + if (S_ISREG(mode)) + type = DT_REG; + else if (S_ISLNK(mode)) + type = DT_LNK; + else if (S_ISFIFO(mode)) + type = DT_FIFO; + else if (S_ISCHR(mode)) + type = DT_CHR; + else if (S_ISBLK(mode)) + type = DT_BLK; + else if (S_ISSOCK(mode)) + type = DT_SOCK; + if (!dir_emit(ctx, strbuf, len, - be32_to_cpu(entry.file.id), DT_REG)) + be32_to_cpu(entry.file.id), type)) break; } else { pr_err("bad catalog entry type\n"); @@ -246,6 +270,7 @@ next: } memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); out: + kfree(strbuf); hfs_find_exit(&fd); return err; } @@ -529,9 +554,10 @@ const struct inode_operations hfsplus_dir_inode_operations = { .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, - .removexattr = hfsplus_removexattr, + .removexattr = generic_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, + .set_acl = hfsplus_set_posix_acl, #endif }; diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index fbb212fbb1e..feca524ce2a 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -227,17 +227,15 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, u32 ablock, dblock, mask; sector_t sector; int was_dirty = 0; - int shift; /* Convert inode block to disk allocation block */ - shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; ablock = iblock >> sbi->fs_shift; if (iblock >= hip->fs_blocks) { if (iblock > hip->fs_blocks || !create) return -EIO; if (ablock >= hip->alloc_blocks) { - res = hfsplus_file_extend(inode); + res = hfsplus_file_extend(inode, false); if (res) return res; } @@ -427,7 +425,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, return res; } -int hfsplus_file_extend(struct inode *inode) +int hfsplus_file_extend(struct inode *inode, bool zeroout) { struct super_block *sb = inode->i_sb; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); @@ -438,10 +436,9 @@ int hfsplus_file_extend(struct inode *inode) if (sbi->alloc_file->i_size * 8 < sbi->total_blocks - sbi->free_blocks + 8) { /* extend alloc file */ - pr_err("extend alloc file! " - "(%llu,%u,%u)\n", - sbi->alloc_file->i_size * 8, - sbi->total_blocks, sbi->free_blocks); + pr_err("extend alloc file! (%llu,%u,%u)\n", + sbi->alloc_file->i_size * 8, + sbi->total_blocks, sbi->free_blocks); return -ENOSPC; } @@ -465,6 +462,12 @@ int hfsplus_file_extend(struct inode *inode) } } + if (zeroout) { + res = sb_issue_zeroout(sb, start, len, GFP_NOFS); + if (res) + goto out; + } + hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { @@ -498,11 +501,13 @@ int hfsplus_file_extend(struct inode *inode) goto insert_extent; } out: - mutex_unlock(&hip->extents_lock); if (!res) { hip->alloc_blocks += len; + mutex_unlock(&hip->extents_lock); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); + return 0; } + mutex_unlock(&hip->extents_lock); return res; insert_extent: @@ -556,11 +561,13 @@ void hfsplus_file_truncate(struct inode *inode) blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> HFSPLUS_SB(sb)->alloc_blksz_shift; + + mutex_lock(&hip->extents_lock); + alloc_cnt = hip->alloc_blocks; if (blk_cnt == alloc_cnt) - goto out; + goto out_unlock; - mutex_lock(&hip->extents_lock); res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); if (res) { mutex_unlock(&hip->extents_lock); @@ -592,10 +599,10 @@ void hfsplus_file_truncate(struct inode *inode) hfs_brec_remove(&fd); } hfs_find_exit(&fd); - mutex_unlock(&hip->extents_lock); hip->alloc_blocks = blk_cnt; -out: +out_unlock: + mutex_unlock(&hip->extents_lock); hip->phys_size = inode->i_size; hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 2b9cd01696e..eb5e059f481 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -127,6 +127,14 @@ struct hfs_bnode { #define HFS_BNODE_DELETED 4 /* + * Attributes file states + */ +#define HFSPLUS_EMPTY_ATTR_TREE 0 +#define HFSPLUS_CREATING_ATTR_TREE 1 +#define HFSPLUS_VALID_ATTR_TREE 2 +#define HFSPLUS_FAILED_ATTR_TREE 3 + +/* * HFS+ superblock info (built from Volume Header on disk) */ @@ -141,6 +149,7 @@ struct hfsplus_sb_info { struct hfs_btree *ext_tree; struct hfs_btree *cat_tree; struct hfs_btree *attr_tree; + atomic_t attr_tree_state; struct inode *alloc_file; struct inode *hidden_dir; struct nls_table *nls; @@ -233,6 +242,7 @@ struct hfsplus_inode_info { */ sector_t fs_blocks; u8 userflags; /* BSD user file flags */ + u32 subfolders; /* Subfolder count (HFSX only) */ struct list_head open_dir_list; loff_t phys_size; @@ -357,115 +367,121 @@ typedef int (*search_strategy_t)(struct hfs_bnode *, */ /* attributes.c */ -int hfsplus_create_attr_tree_cache(void); +int __init hfsplus_create_attr_tree_cache(void); void hfsplus_destroy_attr_tree_cache(void); +int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, + u32 cnid, const char *name); hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); -void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p); -int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *, - const hfsplus_btree_key *); -int hfsplus_attr_build_key(struct super_block *, hfsplus_btree_key *, - u32, const char *); -void hfsplus_attr_build_key_uni(hfsplus_btree_key *key, - u32 cnid, - struct hfsplus_attr_unistr *name); -int hfsplus_find_attr(struct super_block *, u32, - const char *, struct hfs_find_data *); +void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry); +int hfsplus_find_attr(struct super_block *sb, u32 cnid, const char *name, + struct hfs_find_data *fd); int hfsplus_attr_exists(struct inode *inode, const char *name); -int hfsplus_create_attr(struct inode *, const char *, const void *, size_t); -int hfsplus_delete_attr(struct inode *, const char *); +int hfsplus_create_attr(struct inode *inode, const char *name, + const void *value, size_t size); +int hfsplus_delete_attr(struct inode *inode, const char *name); int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid); /* bitmap.c */ -int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *); -int hfsplus_block_free(struct super_block *, u32, u32); +int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, + u32 *max); +int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count); /* btree.c */ -struct hfs_btree *hfs_btree_open(struct super_block *, u32); -void hfs_btree_close(struct hfs_btree *); -int hfs_btree_write(struct hfs_btree *); -struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *); -void hfs_bmap_free(struct hfs_bnode *); +u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors, + int file_id); +struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id); +void hfs_btree_close(struct hfs_btree *tree); +int hfs_btree_write(struct hfs_btree *tree); +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree); +void hfs_bmap_free(struct hfs_bnode *node); /* bnode.c */ -void hfs_bnode_read(struct hfs_bnode *, void *, int, int); -u16 hfs_bnode_read_u16(struct hfs_bnode *, int); -u8 hfs_bnode_read_u8(struct hfs_bnode *, int); -void hfs_bnode_read_key(struct hfs_bnode *, void *, int); -void hfs_bnode_write(struct hfs_bnode *, void *, int, int); -void hfs_bnode_write_u16(struct hfs_bnode *, int, u16); -void hfs_bnode_clear(struct hfs_bnode *, int, int); -void hfs_bnode_copy(struct hfs_bnode *, int, - struct hfs_bnode *, int, int); -void hfs_bnode_move(struct hfs_bnode *, int, int, int); -void hfs_bnode_dump(struct hfs_bnode *); -void hfs_bnode_unlink(struct hfs_bnode *); -struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *, u32); -struct hfs_bnode *hfs_bnode_find(struct hfs_btree *, u32); -void hfs_bnode_unhash(struct hfs_bnode *); -void hfs_bnode_free(struct hfs_bnode *); -struct hfs_bnode *hfs_bnode_create(struct hfs_btree *, u32); -void hfs_bnode_get(struct hfs_bnode *); -void hfs_bnode_put(struct hfs_bnode *); +void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len); +u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off); +u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off); +void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off); +void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len); +void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data); +void hfs_bnode_clear(struct hfs_bnode *node, int off, int len); +void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, + struct hfs_bnode *src_node, int src, int len); +void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len); +void hfs_bnode_dump(struct hfs_bnode *node); +void hfs_bnode_unlink(struct hfs_bnode *node); +struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid); +void hfs_bnode_unhash(struct hfs_bnode *node); +struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num); +void hfs_bnode_free(struct hfs_bnode *node); +struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num); +void hfs_bnode_get(struct hfs_bnode *node); +void hfs_bnode_put(struct hfs_bnode *node); +bool hfs_bnode_need_zeroout(struct hfs_btree *tree); /* brec.c */ -u16 hfs_brec_lenoff(struct hfs_bnode *, u16, u16 *); -u16 hfs_brec_keylen(struct hfs_bnode *, u16); -int hfs_brec_insert(struct hfs_find_data *, void *, int); -int hfs_brec_remove(struct hfs_find_data *); +u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off); +u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec); +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len); +int hfs_brec_remove(struct hfs_find_data *fd); /* bfind.c */ -int hfs_find_init(struct hfs_btree *, struct hfs_find_data *); -void hfs_find_exit(struct hfs_find_data *); -int hfs_find_1st_rec_by_cnid(struct hfs_bnode *, - struct hfs_find_data *, - int *, int *, int *); -int hfs_find_rec_by_key(struct hfs_bnode *, - struct hfs_find_data *, - int *, int *, int *); -int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *, - search_strategy_t); -int hfs_brec_find(struct hfs_find_data *, search_strategy_t); -int hfs_brec_read(struct hfs_find_data *, void *, int); -int hfs_brec_goto(struct hfs_find_data *, int); +int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd); +void hfs_find_exit(struct hfs_find_data *fd); +int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, struct hfs_find_data *fd, + int *begin, int *end, int *cur_rec); +int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd, + int *begin, int *end, int *cur_rec); +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, + search_strategy_t rec_found); +int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare); +int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len); +int hfs_brec_goto(struct hfs_find_data *fd, int cnt); /* catalog.c */ -int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, - const hfsplus_btree_key *); -int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, - const hfsplus_btree_key *); -void hfsplus_cat_build_key(struct super_block *sb, - hfsplus_btree_key *, u32, struct qstr *); -int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *); -int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *); -int hfsplus_delete_cat(u32, struct inode *, struct qstr *); -int hfsplus_rename_cat(u32, struct inode *, struct qstr *, - struct inode *, struct qstr *); +int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, + u32 parent, struct qstr *str); void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); +int hfsplus_find_cat(struct super_block *sb, u32 cnid, + struct hfs_find_data *fd); +int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, + struct inode *inode); +int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str); +int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, struct qstr *src_name, + struct inode *dst_dir, struct qstr *dst_name); /* dir.c */ extern const struct inode_operations hfsplus_dir_inode_operations; extern const struct file_operations hfsplus_dir_operations; /* extents.c */ -int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); -int hfsplus_ext_write_extent(struct inode *); -int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int); -int hfsplus_free_fork(struct super_block *, u32, - struct hfsplus_fork_raw *, int); -int hfsplus_file_extend(struct inode *); -void hfsplus_file_truncate(struct inode *); +int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +int hfsplus_ext_write_extent(struct inode *inode); +int hfsplus_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int hfsplus_free_fork(struct super_block *sb, u32 cnid, + struct hfsplus_fork_raw *fork, int type); +int hfsplus_file_extend(struct inode *inode, bool zeroout); +void hfsplus_file_truncate(struct inode *inode); /* inode.c */ extern const struct address_space_operations hfsplus_aops; extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; -void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); -void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); -int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *); -int hfsplus_cat_write_inode(struct inode *); -struct inode *hfsplus_new_inode(struct super_block *, umode_t); -void hfsplus_delete_inode(struct inode *); +struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode); +void hfsplus_delete_inode(struct inode *inode); +void hfsplus_inode_read_fork(struct inode *inode, + struct hfsplus_fork_raw *fork); +void hfsplus_inode_write_fork(struct inode *inode, + struct hfsplus_fork_raw *fork); +int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd); +int hfsplus_cat_write_inode(struct inode *inode); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); @@ -473,13 +489,17 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); /* options.c */ -int hfsplus_parse_options(char *, struct hfsplus_sb_info *); +void hfsplus_fill_defaults(struct hfsplus_sb_info *opts); int hfsplus_parse_options_remount(char *input, int *force); -void hfsplus_fill_defaults(struct hfsplus_sb_info *); -int hfsplus_show_options(struct seq_file *, struct dentry *); +int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi); +int hfsplus_show_options(struct seq_file *seq, struct dentry *root); + +/* part_tbl.c */ +int hfs_part_find(struct super_block *sb, sector_t *part_start, + sector_t *part_size); /* super.c */ -struct inode *hfsplus_iget(struct super_block *, unsigned long); +struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino); void hfsplus_mark_mdb_dirty(struct super_block *sb); /* tables.c */ @@ -488,23 +508,23 @@ extern u16 hfsplus_decompose_table[]; extern u16 hfsplus_compose_table[]; /* unicode.c */ -int hfsplus_strcasecmp(const struct hfsplus_unistr *, - const struct hfsplus_unistr *); -int hfsplus_strcmp(const struct hfsplus_unistr *, - const struct hfsplus_unistr *); -int hfsplus_uni2asc(struct super_block *, - const struct hfsplus_unistr *, char *, int *); -int hfsplus_asc2uni(struct super_block *, - struct hfsplus_unistr *, int, const char *, int); +int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2); +int hfsplus_strcmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2); +int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, + char *astr, int *len_p); +int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, + int max_unistr_len, const char *astr, int len); int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str); -int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry, - unsigned int len, const char *str, const struct qstr *name); +int hfsplus_compare_dentry(const struct dentry *parent, + const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name); /* wrapper.c */ -int hfsplus_read_wrapper(struct super_block *); -int hfs_part_find(struct super_block *, sector_t *, sector_t *); -int hfsplus_submit_bio(struct super_block *sb, sector_t sector, - void *buf, void **data, int rw); +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, + void **data, int rw); +int hfsplus_read_wrapper(struct super_block *sb); /* time macros */ #define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 452ede01b03..8298d0985f8 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -144,6 +144,7 @@ struct hfsplus_vh { #define HFSPLUS_VOL_NODEID_REUSED (1 << 12) #define HFSPLUS_VOL_JOURNALED (1 << 13) #define HFSPLUS_VOL_SOFTLOCK (1 << 15) +#define HFSPLUS_VOL_UNUSED_NODE_FIX (1 << 31) /* HFS+ BTree node descriptor */ struct hfs_bnode_desc { @@ -156,10 +157,10 @@ struct hfs_bnode_desc { } __packed; /* HFS+ BTree node types */ -#define HFS_NODE_INDEX 0x00 -#define HFS_NODE_HEADER 0x01 -#define HFS_NODE_MAP 0x02 -#define HFS_NODE_LEAF 0xFF +#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ +#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */ +#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ +#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ /* HFS+ BTree header */ struct hfs_btree_header_rec { @@ -187,6 +188,9 @@ struct hfs_btree_header_rec { /* HFS+ BTree misc info */ #define HFSPLUS_TREE_HEAD 0 #define HFSPLUS_NODE_MXSZ 32768 +#define HFSPLUS_ATTR_TREE_NODE_SIZE 8192 +#define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT 3 +#define HFSPLUS_BTREE_HDR_USER_BYTES 128 /* Some special File ID numbers (stolen from hfs.h) */ #define HFSPLUS_POR_CNID 1 /* Parent Of the Root */ @@ -258,7 +262,7 @@ struct hfsplus_cat_folder { struct DInfo user_info; struct DXInfo finder_info; __be32 text_encoding; - u32 reserved; + __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */ } __packed; /* HFS file info (stolen from hfs.h) */ @@ -298,11 +302,13 @@ struct hfsplus_cat_file { struct hfsplus_fork_raw rsrc_fork; } __packed; -/* File attribute bits */ +/* File and folder flag bits */ #define HFSPLUS_FILE_LOCKED 0x0001 #define HFSPLUS_FILE_THREAD_EXISTS 0x0002 #define HFSPLUS_XATTR_EXISTS 0x0004 #define HFSPLUS_ACL_EXISTS 0x0008 +#define HFSPLUS_HAS_FOLDER_COUNT 0x0010 /* Folder has subfolder count + * (HFSX only) */ /* HFS+ catalog thread (part of a cat_entry) */ struct hfsplus_cat_thread { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 37213d075f3..0cf786f2d04 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -123,14 +123,15 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) } static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file)->i_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, hfsplus_get_block); /* @@ -139,7 +140,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) hfsplus_write_failed(mapping, end); @@ -178,64 +179,6 @@ const struct dentry_operations hfsplus_dentry_operations = { .d_compare = hfsplus_compare_dentry, }; -static struct dentry *hfsplus_file_lookup(struct inode *dir, - struct dentry *dentry, unsigned int flags) -{ - struct hfs_find_data fd; - struct super_block *sb = dir->i_sb; - struct inode *inode = NULL; - struct hfsplus_inode_info *hip; - int err; - - if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) - goto out; - - inode = HFSPLUS_I(dir)->rsrc_inode; - if (inode) - goto out; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - hip = HFSPLUS_I(inode); - inode->i_ino = dir->i_ino; - INIT_LIST_HEAD(&hip->open_dir_list); - mutex_init(&hip->extents_lock); - hip->extent_state = 0; - hip->flags = 0; - hip->userflags = 0; - set_bit(HFSPLUS_I_RSRC, &hip->flags); - - err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); - if (!err) { - err = hfsplus_find_cat(sb, dir->i_ino, &fd); - if (!err) - err = hfsplus_cat_read_inode(inode, &fd); - hfs_find_exit(&fd); - } - if (err) { - iput(inode); - return ERR_PTR(err); - } - hip->rsrc_inode = dir; - HFSPLUS_I(dir)->rsrc_inode = inode; - igrab(dir); - - /* - * __mark_inode_dirty expects inodes to be hashed. Since we don't - * want resource fork inodes in the regular inode space, we make them - * appear hashed, but do not put on any lists. hlist_del() - * will work fine and require no locking. - */ - hlist_add_fake(&inode->i_hash); - - mark_inode_dirty(inode); -out: - d_add(dentry, inode); - return NULL; -} - static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) { @@ -319,7 +262,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) mark_inode_dirty(inode); if (attr->ia_valid & ATTR_MODE) { - error = hfsplus_posix_acl_chmod(inode); + error = posix_acl_chmod(inode, inode->i_mode); if (unlikely(error)) return error; } @@ -385,23 +328,23 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, } static const struct inode_operations hfsplus_file_inode_operations = { - .lookup = hfsplus_file_lookup, .setattr = hfsplus_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, - .removexattr = hfsplus_removexattr, + .removexattr = generic_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, + .set_acl = hfsplus_set_posix_acl, #endif }; static const struct file_operations hfsplus_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, .fsync = hfsplus_file_fsync, @@ -433,6 +376,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) hip->extent_state = 0; hip->flags = 0; hip->userflags = 0; + hip->subfolders = 0; memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); hip->alloc_blocks = 0; @@ -552,6 +496,10 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); HFSPLUS_I(inode)->create_date = folder->create_date; HFSPLUS_I(inode)->fs_blocks = 0; + if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { + HFSPLUS_I(inode)->subfolders = + be32_to_cpu(folder->subfolders); + } inode->i_op = &hfsplus_dir_inode_operations; inode->i_fop = &hfsplus_dir_operations; } else if (type == HFSPLUS_FILE) { @@ -624,6 +572,10 @@ int hfsplus_cat_write_inode(struct inode *inode) folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); folder->valence = cpu_to_be32(inode->i_size - 2); + if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { + folder->subfolders = + cpu_to_be32(HFSPLUS_I(inode)->subfolders); + } hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_folder)); } else if (HFSPLUS_IS_RSRC(inode)) { diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 968eab5bc1f..c90b72ee676 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -75,7 +75,7 @@ int hfsplus_parse_options_remount(char *input, int *force) int token; if (!input) - return 0; + return 1; while ((p = strsep(&input, ",")) != NULL) { if (!*p) @@ -173,9 +173,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (p) sbi->nls = load_nls(p); if (!sbi->nls) { - pr_err("unable to load " - "nls mapping \"%s\"\n", - p); + pr_err("unable to load nls mapping \"%s\"\n", + p); kfree(p); return 0; } @@ -232,8 +231,8 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root) if (sbi->nls) seq_printf(seq, ",nls=%s", sbi->nls->charset); if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) - seq_printf(seq, ",nodecompose"); + seq_puts(seq, ",nodecompose"); if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - seq_printf(seq, ",nobarrier"); + seq_puts(seq, ",nobarrier"); return 0; } diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index b609cc14c72..df0c9af68d0 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -17,9 +17,7 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) char *value = NULL; ssize_t size; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); switch (type) { case ACL_TYPE_ACCESS: @@ -56,17 +54,15 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) return acl; } -static int hfsplus_set_posix_acl(struct inode *inode, - int type, - struct posix_acl *acl) +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type) { int err; char *xattr_name; size_t size = 0; char *value = NULL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); switch (type) { case ACL_TYPE_ACCESS: @@ -115,7 +111,7 @@ end_set_acl: int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) { int err = 0; - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; hfs_dbg(ACL_MOD, "[%s]: ino %lu, dir->ino %lu\n", @@ -124,151 +120,21 @@ int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) if (S_ISLNK(inode->i_mode)) return 0; - acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - if (S_ISDIR(inode->i_mode)) { - err = hfsplus_set_posix_acl(inode, - ACL_TYPE_DEFAULT, - acl); - if (unlikely(err)) - goto init_acl_cleanup; - } - - err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (unlikely(err < 0)) - return err; - - if (err > 0) - err = hfsplus_set_posix_acl(inode, - ACL_TYPE_ACCESS, - acl); - } else - inode->i_mode &= ~current_umask(); - -init_acl_cleanup: - posix_acl_release(acl); - return err; -} - -int hfsplus_posix_acl_chmod(struct inode *inode) -{ - int err; - struct posix_acl *acl; - - hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (unlikely(err)) + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) return err; - err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return err; -} - -static int hfsplus_xattr_get_posix_acl(struct dentry *dentry, - const char *name, - void *buffer, - size_t size, - int type) -{ - int err = 0; - struct posix_acl *acl; - - hfs_dbg(ACL_MOD, - "[%s]: ino %lu, buffer %p, size %zu, type %#x\n", - __func__, dentry->d_inode->i_ino, buffer, size, type); - - if (strcmp(name, "") != 0) - return -EINVAL; - - acl = hfsplus_get_posix_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return err; -} - -static int hfsplus_xattr_set_posix_acl(struct dentry *dentry, - const char *name, - const void *value, - size_t size, - int flags, - int type) -{ - int err = 0; - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - - hfs_dbg(ACL_MOD, - "[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n", - __func__, inode->i_ino, value, size, flags, type); - - if (strcmp(name, "") != 0) - return -EINVAL; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - err = posix_acl_valid(acl); - if (err) - goto end_xattr_set_acl; - } + if (default_acl) { + err = hfsplus_set_posix_acl(inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); } - err = hfsplus_set_posix_acl(inode, type, acl); - -end_xattr_set_acl: - posix_acl_release(acl); + if (acl) { + if (!err) + err = hfsplus_set_posix_acl(inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); + } return err; } - -static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry, - char *list, - size_t list_size, - const char *name, - size_t name_len, - int type) -{ - /* - * This method is not used. - * It is used hfsplus_listxattr() instead of generic_listxattr(). - */ - return -EOPNOTSUPP; -} - -const struct xattr_handler hfsplus_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = hfsplus_xattr_list_posix_acl, - .get = hfsplus_xattr_get_posix_acl, - .set = hfsplus_xattr_set_posix_acl, -}; - -const struct xattr_handler hfsplus_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = hfsplus_xattr_list_posix_acl, - .get = hfsplus_xattr_get_posix_acl, - .set = hfsplus_xattr_set_posix_acl, -}; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 4c4d142cf89..4cf2024b87d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -131,9 +131,10 @@ static int hfsplus_system_write_inode(struct inode *inode) hfsplus_inode_write_fork(inode, fork); if (tree) { int err = hfs_btree_write(tree); + if (err) { pr_err("b-tree write err: %d, ino %lu\n", - err, inode->i_ino); + err, inode->i_ino); return err; } } @@ -161,7 +162,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; @@ -323,6 +324,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfsplus_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; if (!(*flags & MS_RDONLY)) { @@ -474,12 +476,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) pr_err("failed to load catalog file\n"); goto out_close_ext_tree; } + atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE); if (vhdr->attr_file.total_blocks != 0) { sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); if (!sbi->attr_tree) { pr_err("failed to load attributes file\n"); goto out_close_cat_tree; } + atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE); } sb->s_xattr = hfsplus_xattr_handlers; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index b51a6079108..cc623567143 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -24,15 +24,8 @@ struct hfsplus_wd { u16 embed_count; }; -static void hfsplus_end_io_sync(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - complete(bio->bi_private); -} - -/* - * hfsplus_submit_bio - Perfrom block I/O +/** + * hfsplus_submit_bio - Perform block I/O * @sb: super block of volume for I/O * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O @@ -53,7 +46,6 @@ static void hfsplus_end_io_sync(struct bio *bio, int err) int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, int rw) { - DECLARE_COMPLETION_ONSTACK(wait); struct bio *bio; int ret = 0; u64 io_size; @@ -71,10 +63,8 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); bio = bio_alloc(GFP_NOIO, 1); - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_bdev = sb->s_bdev; - bio->bi_end_io = hfsplus_end_io_sync; - bio->bi_private = &wait; if (!(rw & WRITE) && data) *data = (u8 *)buf + offset; @@ -93,12 +83,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, buf = (u8 *)buf + len; } - submit_bio(rw, bio); - wait_for_completion(&wait); - - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - + ret = submit_bio_wait(rw, bio); out: bio_put(bio); return ret < 0 ? ret : 0; @@ -246,10 +231,8 @@ reread: if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize)) goto out_free_backup_vhdr; sbi->alloc_blksz = blocksize; - sbi->alloc_blksz_shift = 0; - while ((blocksize >>= 1) != 0) - sbi->alloc_blksz_shift++; - blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE); + sbi->alloc_blksz_shift = ilog2(blocksize); + blocksize = min_t(u32, sbi->alloc_blksz, PAGE_SIZE); /* * Align block size to block offset. diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index bd8471fb9a6..d98094a9f47 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -7,16 +7,20 @@ */ #include "hfsplus_fs.h" +#include <linux/posix_acl_xattr.h> +#include <linux/nls.h> #include "xattr.h" #include "acl.h" +static int hfsplus_removexattr(struct inode *inode, const char *name); + const struct xattr_handler *hfsplus_xattr_handlers[] = { &hfsplus_xattr_osx_handler, &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - &hfsplus_xattr_acl_access_handler, - &hfsplus_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &hfsplus_xattr_security_handler, NULL @@ -51,80 +55,209 @@ static inline int is_known_namespace(const char *name) return true; } -static int can_set_system_xattr(struct inode *inode, const char *name, - const void *value, size_t size) +static void hfsplus_init_header_node(struct inode *attr_file, + u32 clump_size, + char *buf, u16 node_size) { -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - struct posix_acl *acl; - int err; + struct hfs_bnode_desc *desc; + struct hfs_btree_header_rec *head; + u16 offset; + __be16 *rec_offsets; + u32 hdr_node_map_rec_bits; + char *bmp; + u32 used_nodes; + u32 used_bmp_bytes; + u64 tmp; + + hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n", + clump_size, node_size); + + /* The end of the node contains list of record offsets */ + rec_offsets = (__be16 *)(buf + node_size); + + desc = (struct hfs_bnode_desc *)buf; + desc->type = HFS_NODE_HEADER; + desc->num_recs = cpu_to_be16(HFSPLUS_BTREE_HDR_NODE_RECS_COUNT); + offset = sizeof(struct hfs_bnode_desc); + *--rec_offsets = cpu_to_be16(offset); + + head = (struct hfs_btree_header_rec *)(buf + offset); + head->node_size = cpu_to_be16(node_size); + tmp = i_size_read(attr_file); + do_div(tmp, node_size); + head->node_count = cpu_to_be32(tmp); + head->free_nodes = cpu_to_be32(be32_to_cpu(head->node_count) - 1); + head->clump_size = cpu_to_be32(clump_size); + head->attributes |= cpu_to_be32(HFS_TREE_BIGKEYS | HFS_TREE_VARIDXKEYS); + head->max_key_len = cpu_to_be16(HFSPLUS_ATTR_KEYLEN - sizeof(u16)); + offset += sizeof(struct hfs_btree_header_rec); + *--rec_offsets = cpu_to_be16(offset); + offset += HFSPLUS_BTREE_HDR_USER_BYTES; + *--rec_offsets = cpu_to_be16(offset); + + hdr_node_map_rec_bits = 8 * (node_size - offset - (4 * sizeof(u16))); + if (be32_to_cpu(head->node_count) > hdr_node_map_rec_bits) { + u32 map_node_bits; + u32 map_nodes; + + desc->next = cpu_to_be32(be32_to_cpu(head->leaf_tail) + 1); + map_node_bits = 8 * (node_size - sizeof(struct hfs_bnode_desc) - + (2 * sizeof(u16)) - 2); + map_nodes = (be32_to_cpu(head->node_count) - + hdr_node_map_rec_bits + + (map_node_bits - 1)) / map_node_bits; + be32_add_cpu(&head->free_nodes, 0 - map_nodes); + } - if (!inode_owner_or_capable(inode)) - return -EPERM; + bmp = buf + offset; + used_nodes = + be32_to_cpu(head->node_count) - be32_to_cpu(head->free_nodes); + used_bmp_bytes = used_nodes / 8; + if (used_bmp_bytes) { + memset(bmp, 0xFF, used_bmp_bytes); + bmp += used_bmp_bytes; + used_nodes %= 8; + } + *bmp = ~(0xFF >> used_nodes); + offset += hdr_node_map_rec_bits / 8; + *--rec_offsets = cpu_to_be16(offset); +} - /* - * POSIX_ACL_XATTR_ACCESS is tied to i_mode - */ - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - err = posix_acl_equiv_mode(acl, &inode->i_mode); - posix_acl_release(acl); - if (err < 0) - return err; - mark_inode_dirty(inode); - } +static int hfsplus_create_attributes_file(struct super_block *sb) +{ + int err = 0; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct inode *attr_file; + struct hfsplus_inode_info *hip; + u32 clump_size; + u16 node_size = HFSPLUS_ATTR_TREE_NODE_SIZE; + char *buf; + int index, written; + struct address_space *mapping; + struct page *page; + int old_state = HFSPLUS_EMPTY_ATTR_TREE; + + hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID); + +check_attr_tree_state_again: + switch (atomic_read(&sbi->attr_tree_state)) { + case HFSPLUS_EMPTY_ATTR_TREE: + if (old_state != atomic_cmpxchg(&sbi->attr_tree_state, + old_state, + HFSPLUS_CREATING_ATTR_TREE)) + goto check_attr_tree_state_again; + break; + case HFSPLUS_CREATING_ATTR_TREE: /* - * We're changing the ACL. Get rid of the cached one + * This state means that another thread is in process + * of AttributesFile creation. Theoretically, it is + * possible to be here. But really __setxattr() method + * first of all calls hfs_find_init() for lookup in + * B-tree of CatalogFile. This method locks mutex of + * CatalogFile's B-tree. As a result, if some thread + * is inside AttributedFile creation operation then + * another threads will be waiting unlocking of + * CatalogFile's B-tree's mutex. However, if code will + * change then we will return error code (-EAGAIN) from + * here. Really, it means that first try to set of xattr + * fails with error but second attempt will have success. */ - forget_cached_acl(inode, ACL_TYPE_ACCESS); - + return -EAGAIN; + case HFSPLUS_VALID_ATTR_TREE: return 0; - } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - posix_acl_release(acl); + case HFSPLUS_FAILED_ATTR_TREE: + return -EOPNOTSUPP; + default: + BUG(); + } - /* - * We're changing the default ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_DEFAULT); + attr_file = hfsplus_iget(sb, HFSPLUS_ATTR_CNID); + if (IS_ERR(attr_file)) { + pr_err("failed to load attributes file\n"); + return PTR_ERR(attr_file); + } - return 0; + BUG_ON(i_size_read(attr_file) != 0); + + hip = HFSPLUS_I(attr_file); + + clump_size = hfsplus_calc_btree_clump_size(sb->s_blocksize, + node_size, + sbi->sect_count, + HFSPLUS_ATTR_CNID); + + mutex_lock(&hip->extents_lock); + hip->clump_blocks = clump_size >> sbi->alloc_blksz_shift; + mutex_unlock(&hip->extents_lock); + + if (sbi->free_blocks <= (hip->clump_blocks << 1)) { + err = -ENOSPC; + goto end_attr_file_creation; } -#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ - return -EOPNOTSUPP; -} -static int can_set_xattr(struct inode *inode, const char *name, - const void *value, size_t value_len) -{ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return can_set_system_xattr(inode, name, value, value_len); + while (hip->alloc_blocks < hip->clump_blocks) { + err = hfsplus_file_extend(attr_file, false); + if (unlikely(err)) { + pr_err("failed to extend attributes file\n"); + goto end_attr_file_creation; + } + hip->phys_size = attr_file->i_size = + (loff_t)hip->alloc_blocks << sbi->alloc_blksz_shift; + hip->fs_blocks = hip->alloc_blocks << sbi->fs_shift; + inode_set_bytes(attr_file, attr_file->i_size); + } - if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) { - /* - * This makes sure that we aren't trying to set an - * attribute in a different namespace by prefixing it - * with "osx." - */ - if (is_known_namespace(name + XATTR_MAC_OSX_PREFIX_LEN)) - return -EOPNOTSUPP; + buf = kzalloc(node_size, GFP_NOFS); + if (!buf) { + pr_err("failed to allocate memory for header node\n"); + err = -ENOMEM; + goto end_attr_file_creation; + } - return 0; + hfsplus_init_header_node(attr_file, clump_size, buf, node_size); + + mapping = attr_file->i_mapping; + + index = 0; + written = 0; + for (; written < node_size; index++, written += PAGE_CACHE_SIZE) { + void *kaddr; + + page = read_mapping_page(mapping, index, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto failed_header_node_init; + } + + kaddr = kmap_atomic(page); + memcpy(kaddr, buf + written, + min_t(size_t, PAGE_CACHE_SIZE, node_size - written)); + kunmap_atomic(kaddr); + + set_page_dirty(page); + page_cache_release(page); } - /* - * Don't allow setting an attribute in an unknown namespace. - */ - if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && - strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && - strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return -EOPNOTSUPP; + hfsplus_mark_inode_dirty(attr_file, HFSPLUS_I_ATTR_DIRTY); + + sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); + if (!sbi->attr_tree) + pr_err("failed to load attributes file\n"); + +failed_header_node_init: + kfree(buf); + +end_attr_file_creation: + iput(attr_file); - return 0; + if (!err) + atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE); + else if (err == -ENOSPC) + atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE); + else + atomic_set(&sbi->attr_tree_state, HFSPLUS_FAILED_ATTR_TREE); + + return err; } int __hfsplus_setxattr(struct inode *inode, const char *name, @@ -144,18 +277,8 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; - err = can_set_xattr(inode, name, value, size); - if (err) - return err; - - if (strncmp(name, XATTR_MAC_OSX_PREFIX, - XATTR_MAC_OSX_PREFIX_LEN) == 0) - name += XATTR_MAC_OSX_PREFIX_LEN; - - if (value == NULL) { - value = ""; - size = 0; - } + if (value == NULL) + return hfsplus_removexattr(inode, name); err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); if (err) { @@ -211,8 +334,9 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, } if (!HFSPLUS_SB(inode->i_sb)->attr_tree) { - err = -EOPNOTSUPP; - goto end_setxattr; + err = hfsplus_create_attributes_file(inode->i_sb); + if (unlikely(err)) + goto end_setxattr; } if (hfsplus_attr_exists(inode, name)) { @@ -272,16 +396,11 @@ end_setxattr: return err; } -static inline int is_osx_xattr(const char *xattr_name) -{ - return !is_known_namespace(xattr_name); -} - static int name_len(const char *xattr_name, int xattr_name_len) { int len = xattr_name_len + 1; - if (is_osx_xattr(xattr_name)) + if (!is_known_namespace(xattr_name)) len += XATTR_MAC_OSX_PREFIX_LEN; return len; @@ -292,7 +411,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) int len = name_len; int offset = 0; - if (is_osx_xattr(xattr_name)) { + if (!is_known_namespace(xattr_name)) { strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN); offset += XATTR_MAC_OSX_PREFIX_LEN; len += XATTR_MAC_OSX_PREFIX_LEN; @@ -370,18 +489,6 @@ ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, - XATTR_MAC_OSX_PREFIX_LEN) == 0) { - /* skip "osx." prefix */ - name += XATTR_MAC_OSX_PREFIX_LEN; - /* - * Don't allow retrieving properly prefixed attributes - * by prepending them with "osx." - */ - if (is_known_namespace(name)) - return -EOPNOTSUPP; - } - if (!strcmp_xattr_finder_info(name)) return hfsplus_getxattr_finder_info(inode, value, size); @@ -539,8 +646,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) struct hfs_find_data fd; u16 key_len = 0; struct hfsplus_attr_key attr_key; - char strbuf[HFSPLUS_ATTR_MAX_STRLEN + - XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; + char *strbuf; int xattr_name_len; if ((!S_ISREG(inode->i_mode) && @@ -560,6 +666,13 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) return err; } + strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); + if (!strbuf) { + res = -ENOMEM; + goto out; + } + err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd); if (err) { if (err == -ENOENT) { @@ -586,7 +699,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) if (be32_to_cpu(attr_key.cnid) != inode->i_ino) goto end_listxattr; - xattr_name_len = HFSPLUS_ATTR_MAX_STRLEN; + xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN; if (hfsplus_uni2asc(inode->i_sb, (const struct hfsplus_unistr *)&fd.key->attr.key_name, strbuf, &xattr_name_len)) { @@ -612,36 +725,24 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) } end_listxattr: + kfree(strbuf); +out: hfs_find_exit(&fd); return res; } -int hfsplus_removexattr(struct dentry *dentry, const char *name) +static int hfsplus_removexattr(struct inode *inode, const char *name) { int err = 0; - struct inode *inode = dentry->d_inode; struct hfs_find_data cat_fd; u16 flags; u16 cat_entry_type; int is_xattr_acl_deleted = 0; int is_all_xattrs_deleted = 0; - if ((!S_ISREG(inode->i_mode) && - !S_ISDIR(inode->i_mode)) || - HFSPLUS_IS_RSRC(inode)) - return -EOPNOTSUPP; - if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; - err = can_set_xattr(inode, name, NULL, 0); - if (err) - return err; - - if (strncmp(name, XATTR_MAC_OSX_PREFIX, - XATTR_MAC_OSX_PREFIX_LEN) == 0) - name += XATTR_MAC_OSX_PREFIX_LEN; - if (!strcmp_xattr_finder_info(name)) return -EOPNOTSUPP; @@ -705,39 +806,55 @@ end_removexattr: static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + - XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len > HFSPLUS_ATTR_MAX_STRLEN) + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "osx." + */ + if (is_known_namespace(name)) return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); - return hfsplus_getxattr(dentry, xattr_name, buffer, size); + res = hfsplus_getxattr(dentry, xattr_name, buffer, size); + kfree(xattr_name); + return res; } static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + - XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len > HFSPLUS_ATTR_MAX_STRLEN) + /* + * Don't allow setting properly prefixed attributes + * by prepending them with "osx." + */ + if (is_known_namespace(name)) return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); - return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + kfree(xattr_name); + return res; } static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index 841b5698c0f..288530cf80b 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -14,8 +14,6 @@ extern const struct xattr_handler hfsplus_xattr_osx_handler; extern const struct xattr_handler hfsplus_xattr_user_handler; extern const struct xattr_handler hfsplus_xattr_trusted_handler; -extern const struct xattr_handler hfsplus_xattr_acl_access_handler; -extern const struct xattr_handler hfsplus_xattr_acl_default_handler; extern const struct xattr_handler hfsplus_xattr_security_handler; extern const struct xattr_handler *hfsplus_xattr_handlers[]; @@ -42,8 +40,6 @@ static inline ssize_t hfsplus_getxattr(struct dentry *dentry, ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); -int hfsplus_removexattr(struct dentry *dentry, const char *name); - int hfsplus_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index 00722765ea7..6ec5e107691 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -7,6 +7,8 @@ */ #include <linux/security.h> +#include <linux/nls.h> + #include "hfsplus_fs.h" #include "xattr.h" #include "acl.h" @@ -14,37 +16,43 @@ static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_SECURITY_PREFIX); strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); - return hfsplus_getxattr(dentry, xattr_name, buffer, size); + res = hfsplus_getxattr(dentry, xattr_name, buffer, size); + kfree(xattr_name); + return res; } static int hfsplus_security_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_SECURITY_PREFIX); strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); - return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + kfree(xattr_name); + return res; } static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list, @@ -62,31 +70,30 @@ static int hfsplus_initxattrs(struct inode *inode, void *fs_info) { const struct xattr *xattr; - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t xattr_name_len; + char *xattr_name; int err = 0; + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - xattr_name_len = strlen(xattr->name); - if (xattr_name_len == 0) + if (!strcmp(xattr->name, "")) continue; - if (xattr_name_len + XATTR_SECURITY_PREFIX_LEN > - HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - strcpy(xattr_name, XATTR_SECURITY_PREFIX); strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, xattr->name); memset(xattr_name + - XATTR_SECURITY_PREFIX_LEN + xattr_name_len, 0, 1); + XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name), 0, 1); err = __hfsplus_setxattr(inode, xattr_name, xattr->value, xattr->value_len, 0); if (err) break; } + kfree(xattr_name); return err; } diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index 426cee27754..3c5f27e4746 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -6,43 +6,51 @@ * Handler for trusted extended attributes. */ +#include <linux/nls.h> + #include "hfsplus_fs.h" #include "xattr.h" static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_TRUSTED_PREFIX); strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); - return hfsplus_getxattr(dentry, xattr_name, buffer, size); + res = hfsplus_getxattr(dentry, xattr_name, buffer, size); + kfree(xattr_name); + return res; } static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_TRUSTED_PREFIX); strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); - return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + kfree(xattr_name); + return res; } static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index e34016561ae..2b625a538b6 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -6,43 +6,51 @@ * Handler for user extended attributes. */ +#include <linux/nls.h> + #include "hfsplus_fs.h" #include "xattr.h" static int hfsplus_user_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_USER_PREFIX); strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); - return hfsplus_getxattr(dentry, xattr_name, buffer, size); + res = hfsplus_getxattr(dentry, xattr_name, buffer, size); + kfree(xattr_name); + return res; } static int hfsplus_user_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_USER_PREFIX); strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); - return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + kfree(xattr_name); + return res; } static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 25437280a20..bb529f3b7f2 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -33,15 +33,6 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) #define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file)) -static int hostfs_d_delete(const struct dentry *dentry) -{ - return 1; -} - -static const struct dentry_operations hostfs_dentry_ops = { - .d_delete = hostfs_d_delete, -}; - /* Changed in hostfs_args before the kernel starts running */ static char *root_ino = ""; static int append = 0; @@ -195,7 +186,7 @@ static struct inode *hostfs_iget(struct super_block *sb) return inode; } -int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) +static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) { /* * do_statfs uses struct statfs64 internally, but the linux kernel @@ -239,7 +230,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) static void hostfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HOSTFS_I(inode)->fd != -1) { close_file(&HOSTFS_I(inode)->fd); @@ -277,7 +268,7 @@ static const struct super_operations hostfs_sbops = { .show_options = hostfs_show_options, }; -int hostfs_readdir(struct file *file, struct dir_context *ctx) +static int hostfs_readdir(struct file *file, struct dir_context *ctx) { void *dir; char *name; @@ -302,7 +293,7 @@ int hostfs_readdir(struct file *file, struct dir_context *ctx) return 0; } -int hostfs_file_open(struct inode *ino, struct file *file) +static int hostfs_file_open(struct inode *ino, struct file *file) { static DEFINE_MUTEX(open_mutex); char *name; @@ -368,7 +359,8 @@ static int hostfs_file_release(struct inode *inode, struct file *file) return 0; } -int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +static int hostfs_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { struct inode *inode = file->f_mapping->host; int ret; @@ -386,11 +378,11 @@ int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) static const struct file_operations hostfs_file_fops = { .llseek = generic_file_llseek, - .read = do_sync_read, + .read = new_sync_read, .splice_read = generic_file_splice_read, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, - .write = do_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .write = new_sync_write, .mmap = generic_file_mmap, .open = hostfs_file_open, .release = hostfs_file_release, @@ -403,7 +395,7 @@ static const struct file_operations hostfs_dir_fops = { .read = generic_read_dir, }; -int hostfs_writepage(struct page *page, struct writeback_control *wbc) +static int hostfs_writepage(struct page *page, struct writeback_control *wbc) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; @@ -439,7 +431,7 @@ int hostfs_writepage(struct page *page, struct writeback_control *wbc) return err; } -int hostfs_readpage(struct file *file, struct page *page) +static int hostfs_readpage(struct file *file, struct page *page) { char *buffer; long long start; @@ -464,9 +456,9 @@ int hostfs_readpage(struct file *file, struct page *page) return err; } -int hostfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) +static int hostfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_CACHE_SHIFT; @@ -476,9 +468,9 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping, return 0; } -int hostfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static int hostfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct inode *inode = mapping->host; void *buffer; @@ -558,8 +550,8 @@ static int read_name(struct inode *ino, char *name) return 0; } -int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) { struct inode *inode; char *name; @@ -600,8 +592,8 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return error; } -struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, - unsigned int flags) +static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, + unsigned int flags) { struct inode *inode; char *name; @@ -637,7 +629,8 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, return ERR_PTR(err); } -int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) +static int hostfs_link(struct dentry *to, struct inode *ino, + struct dentry *from) { char *from_name, *to_name; int err; @@ -655,7 +648,7 @@ int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) return err; } -int hostfs_unlink(struct inode *ino, struct dentry *dentry) +static int hostfs_unlink(struct inode *ino, struct dentry *dentry) { char *file; int err; @@ -671,7 +664,8 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry) return err; } -int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) +static int hostfs_symlink(struct inode *ino, struct dentry *dentry, + const char *to) { char *file; int err; @@ -683,7 +677,7 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) return err; } -int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) +static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) { char *file; int err; @@ -695,7 +689,7 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) return err; } -int hostfs_rmdir(struct inode *ino, struct dentry *dentry) +static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) { char *file; int err; @@ -747,8 +741,8 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -int hostfs_rename(struct inode *from_ino, struct dentry *from, - struct inode *to_ino, struct dentry *to) +static int hostfs_rename(struct inode *from_ino, struct dentry *from, + struct inode *to_ino, struct dentry *to) { char *from_name, *to_name; int err; @@ -765,7 +759,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, return err; } -int hostfs_permission(struct inode *ino, int desired) +static int hostfs_permission(struct inode *ino, int desired) { char *name; int r = 0, w = 0, x = 0, err; @@ -791,7 +785,7 @@ int hostfs_permission(struct inode *ino, int desired) return err; } -int hostfs_setattr(struct dentry *dentry, struct iattr *attr) +static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct hostfs_iattr attrs; @@ -925,7 +919,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_blocksize_bits = 10; sb->s_magic = HOSTFS_SUPER_MAGIC; sb->s_op = &hostfs_sbops; - sb->s_d_op = &hostfs_dentry_ops; + sb->s_d_op = &simple_dentry_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; /* NULL is printed as <NULL> by sprintf: avoid that. */ diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index cdb84a83806..f005046e159 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -8,6 +8,58 @@ #include "hpfs_fn.h" +static void hpfs_claim_alloc(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free != (unsigned)-1) { + if (unlikely(!sbi->sb_n_free)) { + hpfs_error(s, "free count underflow, allocating sector %08x", sec); + sbi->sb_n_free = -1; + return; + } + sbi->sb_n_free--; + } +} + +static void hpfs_claim_free(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free != (unsigned)-1) { + if (unlikely(sbi->sb_n_free >= sbi->sb_fs_size)) { + hpfs_error(s, "free count overflow, freeing sector %08x", sec); + sbi->sb_n_free = -1; + return; + } + sbi->sb_n_free++; + } +} + +static void hpfs_claim_dirband_alloc(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes != (unsigned)-1) { + if (unlikely(!sbi->sb_n_free_dnodes)) { + hpfs_error(s, "dirband free count underflow, allocating sector %08x", sec); + sbi->sb_n_free_dnodes = -1; + return; + } + sbi->sb_n_free_dnodes--; + } +} + +static void hpfs_claim_dirband_free(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes != (unsigned)-1) { + if (unlikely(sbi->sb_n_free_dnodes >= sbi->sb_dirband_size / 4)) { + hpfs_error(s, "dirband free count overflow, freeing sector %08x", sec); + sbi->sb_n_free_dnodes = -1; + return; + } + sbi->sb_n_free_dnodes++; + } +} + /* * Check if a sector is allocated in bitmap * This is really slow. Turned on only if chk==2 @@ -203,9 +255,15 @@ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forwa } sec = 0; ret: + if (sec) { + i = 0; + do + hpfs_claim_alloc(s, sec + i); + while (unlikely(++i < n)); + } if (sec && f_p) { for (i = 0; i < forward; i++) { - if (!hpfs_alloc_if_possible(s, sec + i + 1)) { + if (!hpfs_alloc_if_possible(s, sec + n + i)) { hpfs_error(s, "Prealloc doesn't work! Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i); sec = 0; break; @@ -228,6 +286,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near) nr >>= 2; sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0); if (!sec) return 0; + hpfs_claim_dirband_alloc(s, sec); return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start; } @@ -242,6 +301,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec) bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); + hpfs_claim_alloc(s, sec); return 1; } hpfs_brelse4(&qbh); @@ -256,7 +316,7 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) struct quad_buffer_head qbh; __le32 *bmp; struct hpfs_sb_info *sbi = hpfs_sb(s); - /*printk("2 - ");*/ + /*pr_info("2 - ");*/ if (!n) return; if (sec < 0x12) { hpfs_error(s, "Trying to free reserved sector %08x", sec); @@ -275,6 +335,7 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) return; } bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f)); + hpfs_claim_free(s, sec); if (!--n) { hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); @@ -359,6 +420,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno) bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f)); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); + hpfs_claim_dirband_free(s, dno); } } @@ -366,7 +428,7 @@ struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near, dnode_secno *dno, struct quad_buffer_head *qbh) { struct dnode *d; - if (hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_dmap) > FREE_DNODES_ADD) { + if (hpfs_get_free_dnodes(s) > FREE_DNODES_ADD) { if (!(*dno = alloc_in_dirband(s, near))) if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL; } else { diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index 4d0a1afa058..8057fe4e657 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -55,7 +55,7 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head if (bh != NULL) return bh->b_data; else { - printk("HPFS: hpfs_map_sector: read error\n"); + pr_err("%s(): read error\n", __func__); return NULL; } } @@ -76,7 +76,7 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head set_buffer_uptodate(bh); return bh->b_data; } else { - printk("HPFS: hpfs_get_sector: getblk failed\n"); + pr_err("%s(): getblk failed\n", __func__); return NULL; } } @@ -86,7 +86,6 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh, int ahead) { - struct buffer_head *bh; char *data; hpfs_lock_assert(s); @@ -94,40 +93,38 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe cond_resched(); if (secno & 3) { - printk("HPFS: hpfs_map_4sectors: unaligned read\n"); + pr_err("%s(): unaligned read\n", __func__); return NULL; } hpfs_prefetch_sectors(s, secno, 4 + ahead); + if (!(qbh->bh[0] = sb_bread(s, secno + 0))) goto bail0; + if (!(qbh->bh[1] = sb_bread(s, secno + 1))) goto bail1; + if (!(qbh->bh[2] = sb_bread(s, secno + 2))) goto bail2; + if (!(qbh->bh[3] = sb_bread(s, secno + 3))) goto bail3; + + if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) && + likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && + likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) { + return qbh->data = qbh->bh[0]->b_data; + } + qbh->data = data = kmalloc(2048, GFP_NOFS); if (!data) { - printk("HPFS: hpfs_map_4sectors: out of memory\n"); - goto bail; + pr_err("%s(): out of memory\n", __func__); + goto bail4; } - qbh->bh[0] = bh = sb_bread(s, secno); - if (!bh) - goto bail0; - memcpy(data, bh->b_data, 512); - - qbh->bh[1] = bh = sb_bread(s, secno + 1); - if (!bh) - goto bail1; - memcpy(data + 512, bh->b_data, 512); - - qbh->bh[2] = bh = sb_bread(s, secno + 2); - if (!bh) - goto bail2; - memcpy(data + 2 * 512, bh->b_data, 512); - - qbh->bh[3] = bh = sb_bread(s, secno + 3); - if (!bh) - goto bail3; - memcpy(data + 3 * 512, bh->b_data, 512); + memcpy(data + 0 * 512, qbh->bh[0]->b_data, 512); + memcpy(data + 1 * 512, qbh->bh[1]->b_data, 512); + memcpy(data + 2 * 512, qbh->bh[2]->b_data, 512); + memcpy(data + 3 * 512, qbh->bh[3]->b_data, 512); return data; + bail4: + brelse(qbh->bh[3]); bail3: brelse(qbh->bh[2]); bail2: @@ -135,9 +132,6 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe bail1: brelse(qbh->bh[0]); bail0: - kfree(data); - printk("HPFS: hpfs_map_4sectors: read error\n"); - bail: return NULL; } @@ -151,48 +145,58 @@ void *hpfs_get_4sectors(struct super_block *s, unsigned secno, hpfs_lock_assert(s); if (secno & 3) { - printk("HPFS: hpfs_get_4sectors: unaligned read\n"); + pr_err("%s(): unaligned read\n", __func__); return NULL; } - /*return hpfs_map_4sectors(s, secno, qbh, 0);*/ + if (!hpfs_get_sector(s, secno + 0, &qbh->bh[0])) goto bail0; + if (!hpfs_get_sector(s, secno + 1, &qbh->bh[1])) goto bail1; + if (!hpfs_get_sector(s, secno + 2, &qbh->bh[2])) goto bail2; + if (!hpfs_get_sector(s, secno + 3, &qbh->bh[3])) goto bail3; + + if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) && + likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && + likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) { + return qbh->data = qbh->bh[0]->b_data; + } + if (!(qbh->data = kmalloc(2048, GFP_NOFS))) { - printk("HPFS: hpfs_get_4sectors: out of memory\n"); - return NULL; + pr_err("%s(): out of memory\n", __func__); + goto bail4; } - if (!(hpfs_get_sector(s, secno, &qbh->bh[0]))) goto bail0; - if (!(hpfs_get_sector(s, secno + 1, &qbh->bh[1]))) goto bail1; - if (!(hpfs_get_sector(s, secno + 2, &qbh->bh[2]))) goto bail2; - if (!(hpfs_get_sector(s, secno + 3, &qbh->bh[3]))) goto bail3; - memcpy(qbh->data, qbh->bh[0]->b_data, 512); - memcpy(qbh->data + 512, qbh->bh[1]->b_data, 512); - memcpy(qbh->data + 2*512, qbh->bh[2]->b_data, 512); - memcpy(qbh->data + 3*512, qbh->bh[3]->b_data, 512); return qbh->data; - bail3: brelse(qbh->bh[2]); - bail2: brelse(qbh->bh[1]); - bail1: brelse(qbh->bh[0]); - bail0: +bail4: + brelse(qbh->bh[3]); +bail3: + brelse(qbh->bh[2]); +bail2: + brelse(qbh->bh[1]); +bail1: + brelse(qbh->bh[0]); +bail0: return NULL; } void hpfs_brelse4(struct quad_buffer_head *qbh) { - brelse(qbh->bh[3]); - brelse(qbh->bh[2]); - brelse(qbh->bh[1]); + if (unlikely(qbh->data != qbh->bh[0]->b_data)) + kfree(qbh->data); brelse(qbh->bh[0]); - kfree(qbh->data); + brelse(qbh->bh[1]); + brelse(qbh->bh[2]); + brelse(qbh->bh[3]); } void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh) { - memcpy(qbh->bh[0]->b_data, qbh->data, 512); - memcpy(qbh->bh[1]->b_data, qbh->data + 512, 512); - memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); - memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); + if (unlikely(qbh->data != qbh->bh[0]->b_data)) { + memcpy(qbh->bh[0]->b_data, qbh->data + 0 * 512, 512); + memcpy(qbh->bh[1]->b_data, qbh->data + 1 * 512, 512); + memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); + memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); + } mark_buffer_dirty(qbh->bh[0]); mark_buffer_dirty(qbh->bh[1]); mark_buffer_dirty(qbh->bh[2]); diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 292b1acb9b8..2a8e07425de 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -36,7 +36,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) mutex_lock(&i->i_mutex); hpfs_lock(s); - /*printk("dir lseek\n");*/ + /*pr_info("dir lseek\n");*/ if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; while (pos != new_off) { @@ -51,7 +51,7 @@ ok: mutex_unlock(&i->i_mutex); return new_off; fail: - /*printk("illegal lseek: %016llx\n", new_off);*/ + /*pr_warn("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); mutex_unlock(&i->i_mutex); return -ESPIPE; @@ -127,7 +127,7 @@ static int hpfs_readdir(struct file *file, struct dir_context *ctx) if (ctx->pos == 12) goto out; if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) { - printk("HPFS: warning: pos==%d\n",(int)ctx->pos); + pr_err("pos==%d\n", (int)ctx->pos); goto out; } if (ctx->pos == 0) { diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 4364b2a02c5..f36fc010fcc 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -17,7 +17,7 @@ static loff_t get_pos(struct dnode *d, struct hpfs_dirent *fde) if (de == fde) return ((loff_t) le32_to_cpu(d->self) << 4) | (loff_t)i; i++; } - printk("HPFS: get_pos: not_found\n"); + pr_info("%s(): not_found\n", __func__); return ((loff_t)le32_to_cpu(d->self) << 4) | (loff_t)1; } @@ -32,7 +32,7 @@ void hpfs_add_pos(struct inode *inode, loff_t *pos) if (hpfs_inode->i_rddir_off[i] == pos) return; if (!(i&0x0f)) { if (!(ppos = kmalloc((i+0x11) * sizeof(loff_t*), GFP_NOFS))) { - printk("HPFS: out of memory for position list\n"); + pr_err("out of memory for position list\n"); return; } if (hpfs_inode->i_rddir_off) { @@ -63,7 +63,8 @@ void hpfs_del_pos(struct inode *inode, loff_t *pos) } return; not_f: - /*printk("HPFS: warning: position pointer %p->%08x not found\n", pos, (int)*pos);*/ + /*pr_warn("position pointer %p->%08x not found\n", + pos, (int)*pos);*/ return; } @@ -92,8 +93,11 @@ static void hpfs_pos_ins(loff_t *p, loff_t d, loff_t c) { if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { int n = (*p & 0x3f) + c; - if (n > 0x3f) printk("HPFS: hpfs_pos_ins: %08x + %d\n", (int)*p, (int)c >> 8); - else *p = (*p & ~0x3f) | n; + if (n > 0x3f) + pr_err("%s(): %08x + %d\n", + __func__, (int)*p, (int)c >> 8); + else + *p = (*p & ~0x3f) | n; } } @@ -101,8 +105,11 @@ static void hpfs_pos_del(loff_t *p, loff_t d, loff_t c) { if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { int n = (*p & 0x3f) - c; - if (n < 1) printk("HPFS: hpfs_pos_ins: %08x - %d\n", (int)*p, (int)c >> 8); - else *p = (*p & ~0x3f) | n; + if (n < 1) + pr_err("%s(): %08x - %d\n", + __func__, (int)*p, (int)c >> 8); + else + *p = (*p & ~0x3f) | n; } } @@ -239,12 +246,12 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, struct fnode *fnode; int c1, c2 = 0; if (!(nname = kmalloc(256, GFP_NOFS))) { - printk("HPFS: out of memory, can't add to dnode\n"); + pr_err("out of memory, can't add to dnode\n"); return 1; } go_up: if (namelen >= 256) { - hpfs_error(i->i_sb, "hpfs_add_to_dnode: namelen == %d", namelen); + hpfs_error(i->i_sb, "%s(): namelen == %d", __func__, namelen); kfree(nd); kfree(nname); return 1; @@ -281,7 +288,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, not be any error while splitting dnodes, otherwise the whole directory, not only file we're adding, would be lost. */ - printk("HPFS: out of memory for dnode splitting\n"); + pr_err("out of memory for dnode splitting\n"); hpfs_brelse4(&qbh); kfree(nname); return 1; @@ -597,7 +604,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) if (!de_next->down) goto endm; ndown = de_down_pointer(de_next); if (!(de_cp = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { - printk("HPFS: out of memory for dtree balancing\n"); + pr_err("out of memory for dtree balancing\n"); goto endm; } memcpy(de_cp, de, le16_to_cpu(de->length)); @@ -612,7 +619,8 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) hpfs_brelse4(&qbh1); } hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, de_cp->down ? de_down_pointer(de_cp) : 0); - /*printk("UP-TO-DNODE: %08x (ndown = %08x, down = %08x, dno = %08x)\n", up, ndown, down, dno);*/ + /*pr_info("UP-TO-DNODE: %08x (ndown = %08x, down = %08x, dno = %08x)\n", + up, ndown, down, dno);*/ dno = up; kfree(de_cp); goto try_it_again; @@ -637,15 +645,15 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) if (!dlp && down) { if (le32_to_cpu(d1->first_free) > 2044) { if (hpfs_sb(i->i_sb)->sb_chk >= 2) { - printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); - printk("HPFS: warning: terminating balancing operation\n"); + pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n"); + pr_err("terminating balancing operation\n"); } hpfs_brelse4(&qbh1); goto endm; } if (hpfs_sb(i->i_sb)->sb_chk >= 2) { - printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); - printk("HPFS: warning: goin'on\n"); + pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n"); + pr_err("goin'on\n"); } le16_add_cpu(&del->length, 4); del->down = 1; @@ -659,7 +667,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); } else goto endm; if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) { - printk("HPFS: out of memory for dtree balancing\n"); + pr_err("out of memory for dtree balancing\n"); hpfs_brelse4(&qbh1); goto endm; } @@ -1000,7 +1008,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, int d1, d2 = 0; name1 = f->name; if (!(name2 = kmalloc(256, GFP_NOFS))) { - printk("HPFS: out of memory, can't map dirent\n"); + pr_err("out of memory, can't map dirent\n"); return NULL; } if (f->len <= 15) diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index bcaafcd2666..ce3f98ba993 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -51,7 +51,7 @@ static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size) { char *ret; if (!(ret = kmalloc(size + 1, GFP_NOFS))) { - printk("HPFS: out of memory for EA\n"); + pr_err("out of memory for EA\n"); return NULL; } if (hpfs_ea_read(s, a, ano, 0, size, ret)) { @@ -139,7 +139,7 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si if (ea_indirect(ea)) return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea)); if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { - printk("HPFS: out of memory for EA\n"); + pr_err("out of memory for EA\n"); return NULL; } memcpy(ret, ea_data(ea), ea_valuelen(ea)); @@ -165,7 +165,7 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si if (ea_indirect(ea)) return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea)); if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { - printk("HPFS: out of memory for EA\n"); + pr_err("out of memory for EA\n"); return NULL; } if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), ret)) { diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 67c1a61e095..7f54e5f76ce 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -197,10 +197,10 @@ const struct address_space_operations hpfs_aops = { const struct file_operations hpfs_file_ops = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .release = hpfs_file_release, .fsync = hpfs_file_fsync, diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 1b398636e99..b63b75fa00e 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -8,6 +8,11 @@ //#define DBG //#define DEBUG_LOCKS +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mutex.h> #include <linux/pagemap.h> @@ -80,6 +85,7 @@ struct hpfs_sb_info { unsigned sb_c_bitmap; /* current bitmap */ unsigned sb_max_fwd_alloc; /* max forwad allocation */ int sb_timeshift; + struct rcu_head rcu; }; /* Four 512-byte buffers and the 2k block obtained by concatenating them */ @@ -311,7 +317,7 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) __printf(2, 3) void hpfs_error(struct super_block *, const char *, ...); int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); -unsigned hpfs_count_one_bitmap(struct super_block *, secno); +unsigned hpfs_get_free_dnodes(struct super_block *); /* * local time (HPFS) to GMT (Unix) diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 9edeeb0ea97..7ce4b74234a 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -183,7 +183,8 @@ void hpfs_write_inode(struct inode *i) struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { - if (*hpfs_inode->i_rddir_off) printk("HPFS: write_inode: some position still there\n"); + if (*hpfs_inode->i_rddir_off) + pr_err("write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); hpfs_inode->i_rddir_off = NULL; } @@ -304,7 +305,7 @@ void hpfs_write_if_changed(struct inode *inode) void hpfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (!inode->i_nlink) { hpfs_lock(inode->i_sb); diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index 3aa66ae1031..442770edcdc 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -65,12 +65,13 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps) struct code_page_directory *cp = hpfs_map_sector(s, cps, &bh, 0); if (!cp) return NULL; if (le32_to_cpu(cp->magic) != CP_DIR_MAGIC) { - printk("HPFS: Code page directory magic doesn't match (magic = %08x)\n", le32_to_cpu(cp->magic)); + pr_err("Code page directory magic doesn't match (magic = %08x)\n", + le32_to_cpu(cp->magic)); brelse(bh); return NULL; } if (!le32_to_cpu(cp->n_code_pages)) { - printk("HPFS: n_code_pages == 0\n"); + pr_err("n_code_pages == 0\n"); brelse(bh); return NULL; } @@ -79,19 +80,19 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps) brelse(bh); if (cpi >= 3) { - printk("HPFS: Code page index out of array\n"); + pr_err("Code page index out of array\n"); return NULL; } if (!(cpd = hpfs_map_sector(s, cpds, &bh, 0))) return NULL; if (le16_to_cpu(cpd->offs[cpi]) > 0x178) { - printk("HPFS: Code page index out of sector\n"); + pr_err("Code page index out of sector\n"); brelse(bh); return NULL; } ptr = (unsigned char *)cpd + le16_to_cpu(cpd->offs[cpi]) + 6; if (!(cp_table = kmalloc(256, GFP_KERNEL))) { - printk("HPFS: out of memory for code page table\n"); + pr_err("out of memory for code page table\n"); brelse(bh); return NULL; } @@ -114,7 +115,7 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) int i; __le32 *b; if (!(b = kmalloc(n * 512, GFP_KERNEL))) { - printk("HPFS: can't allocate memory for bitmap directory\n"); + pr_err("can't allocate memory for bitmap directory\n"); return NULL; } for (i=0;i<n;i++) { @@ -281,7 +282,9 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, hpfs_error(s, "dnode %08x does not end with \\377 entry", secno); goto bail; } - if (b == 3) printk("HPFS: warning: unbalanced dnode tree, dnode %08x; see hpfs.txt 4 more info\n", secno); + if (b == 3) + pr_err("unbalanced dnode tree, dnode %08x; see hpfs.txt 4 more info\n", + secno); } return dnode; bail: diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c index 9acdf338def..b00d396d22c 100644 --- a/fs/hpfs/name.c +++ b/fs/hpfs/name.c @@ -56,14 +56,15 @@ unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from, unsigned char *to; int i; if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { - printk("HPFS: Long name flag mismatch - name "); - for (i=0; i<len; i++) printk("%c", from[i]); - printk(" misidentified as %s.\n", lng ? "short" : "long"); - printk("HPFS: It's nothing serious. It could happen because of bug in OS/2.\nHPFS: Set checks=normal to disable this message.\n"); + pr_err("Long name flag mismatch - name "); + for (i = 0; i < len; i++) + pr_cont("%c", from[i]); + pr_cont(" misidentified as %s.\n", lng ? "short" : "long"); + pr_err("It's nothing serious. It could happen because of bug in OS/2.\nSet checks=normal to disable this message.\n"); } if (!lc) return from; if (!(to = kmalloc(len, GFP_KERNEL))) { - printk("HPFS: can't allocate memory for name conversion buffer\n"); + pr_err("can't allocate memory for name conversion buffer\n"); return from; } for (i = 0; i < len; i++) to[i] = locase(hpfs_sb(s)->sb_cp_table,from[i]); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 345713d2f8f..bdbc2c3080a 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -404,10 +404,10 @@ again: d_rehash(dentry); } else { struct iattr newattrs; - /*printk("HPFS: truncating file before delete.\n");*/ + /*pr_info("truncating file before delete.\n");*/ newattrs.ia_size = 0; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; - err = notify_change(dentry, &newattrs); + err = notify_change(dentry, &newattrs, NULL); put_write_access(inode); if (!err) goto again; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 4334cda8dba..7cd00d3a7c9 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -62,22 +62,26 @@ void hpfs_error(struct super_block *s, const char *fmt, ...) vsnprintf(err_buf, sizeof(err_buf), fmt, args); va_end(args); - printk("HPFS: filesystem error: %s", err_buf); + pr_err("filesystem error: %s", err_buf); if (!hpfs_sb(s)->sb_was_error) { if (hpfs_sb(s)->sb_err == 2) { - printk("; crashing the system because you wanted it\n"); + pr_cont("; crashing the system because you wanted it\n"); mark_dirty(s, 0); panic("HPFS panic"); } else if (hpfs_sb(s)->sb_err == 1) { - if (s->s_flags & MS_RDONLY) printk("; already mounted read-only\n"); + if (s->s_flags & MS_RDONLY) + pr_cont("; already mounted read-only\n"); else { - printk("; remounting read-only\n"); + pr_cont("; remounting read-only\n"); mark_dirty(s, 0); s->s_flags |= MS_RDONLY; } - } else if (s->s_flags & MS_RDONLY) printk("; going on - but anything won't be destroyed because it's read-only\n"); - else printk("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n"); - } else printk("\n"); + } else if (s->s_flags & MS_RDONLY) + pr_cont("; going on - but anything won't be destroyed because it's read-only\n"); + else + pr_cont("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n"); + } else + pr_cont("\n"); hpfs_sb(s)->sb_was_error = 1; } @@ -101,21 +105,27 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2, return 0; } -static void hpfs_put_super(struct super_block *s) +static void free_sbi(struct hpfs_sb_info *sbi) { - struct hpfs_sb_info *sbi = hpfs_sb(s); + kfree(sbi->sb_cp_table); + kfree(sbi->sb_bmp_dir); + kfree(sbi); +} +static void lazy_free_sbi(struct rcu_head *rcu) +{ + free_sbi(container_of(rcu, struct hpfs_sb_info, rcu)); +} + +static void hpfs_put_super(struct super_block *s) +{ hpfs_lock(s); unmark_dirty(s); hpfs_unlock(s); - - kfree(sbi->sb_cp_table); - kfree(sbi->sb_bmp_dir); - s->s_fs_info = NULL; - kfree(sbi); + call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi); } -unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) +static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) { struct quad_buffer_head qbh; unsigned long *bits; @@ -123,7 +133,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) bits = hpfs_map_4sectors(s, secno, &qbh, 0); if (!bits) - return 0; + return (unsigned)-1; count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); hpfs_brelse4(&qbh); return count; @@ -138,30 +148,45 @@ static unsigned count_bitmaps(struct super_block *s) hpfs_prefetch_bitmap(s, n); } for (n = 0; n < n_bands; n++) { + unsigned c; hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD); - count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); + c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); + if (c != (unsigned)-1) + count += c; } return count; } +unsigned hpfs_get_free_dnodes(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes == (unsigned)-1) { + unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap); + if (c == (unsigned)-1) + return 0; + sbi->sb_n_free_dnodes = c; + } + return sbi->sb_n_free_dnodes; +} + static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); + hpfs_lock(s); - /*if (sbi->sb_n_free == -1) {*/ + if (sbi->sb_n_free == (unsigned)-1) sbi->sb_n_free = count_bitmaps(s); - sbi->sb_n_free_dnodes = hpfs_count_one_bitmap(s, sbi->sb_dmap); - /*}*/ + buf->f_type = s->s_magic; buf->f_bsize = 512; buf->f_blocks = sbi->sb_fs_size; buf->f_bfree = sbi->sb_n_free; buf->f_bavail = sbi->sb_n_free; buf->f_files = sbi->sb_dirband_size / 4; - buf->f_ffree = sbi->sb_n_free_dnodes; + buf->f_ffree = hpfs_get_free_dnodes(s); buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = 254; @@ -271,7 +296,7 @@ static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask, if (!opts) return 1; - /*printk("Parsing opts: '%s'\n",opts);*/ + /*pr_info("Parsing opts: '%s'\n",opts);*/ while ((p = strsep(&opts, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; @@ -366,7 +391,7 @@ static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask, static inline void hpfs_help(void) { - printk("\n\ + pr_info("\n\ HPFS filesystem options:\n\ help do not mount and display this text\n\ uid=xxx set uid of files that don't have uid specified in eas\n\ @@ -400,6 +425,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) struct hpfs_sb_info *sbi = hpfs_sb(s); char *new_opts = kstrdup(data, GFP_KERNEL); + sync_filesystem(s); + *flags |= MS_NOATIME; hpfs_lock(s); @@ -411,7 +438,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase, &eas, &chk, &errs, &chkdsk, ×hift))) { - printk("HPFS: bad mount options.\n"); + pr_err("bad mount options.\n"); goto out_err; } if (o == 2) { @@ -419,7 +446,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) goto out_err; } if (timeshift != sbi->sb_timeshift) { - printk("HPFS: timeshift can't be changed using remount.\n"); + pr_err("timeshift can't be changed using remount.\n"); goto out_err; } @@ -485,9 +512,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) } s->s_fs_info = sbi; - sbi->sb_bmp_dir = NULL; - sbi->sb_cp_table = NULL; - mutex_init(&sbi->hpfs_mutex); hpfs_lock(s); @@ -503,7 +527,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase, &eas, &chk, &errs, &chkdsk, ×hift))) { - printk("HPFS: bad mount options.\n"); + pr_err("bad mount options.\n"); goto bail0; } if (o==2) { @@ -522,16 +546,17 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (/*le16_to_cpu(bootblock->magic) != BB_MAGIC ||*/ le32_to_cpu(superblock->magic) != SB_MAGIC || le32_to_cpu(spareblock->magic) != SP_MAGIC) { - if (!silent) printk("HPFS: Bad magic ... probably not HPFS\n"); + if (!silent) + pr_err("Bad magic ... probably not HPFS\n"); goto bail4; } /* Check version */ if (!(s->s_flags & MS_RDONLY) && superblock->funcversion != 2 && superblock->funcversion != 3) { - printk("HPFS: Bad version %d,%d. Mount readonly to go around\n", + pr_err("Bad version %d,%d. Mount readonly to go around\n", (int)superblock->version, (int)superblock->funcversion); - printk("HPFS: please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n"); + pr_err("please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n"); goto bail4; } @@ -577,7 +602,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) /* Check for general fs errors*/ if (spareblock->dirty && !spareblock->old_wrote) { if (errs == 2) { - printk("HPFS: Improperly stopped, not mounted\n"); + pr_err("Improperly stopped, not mounted\n"); goto bail4; } hpfs_error(s, "improperly stopped"); @@ -591,22 +616,25 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (spareblock->hotfixes_used || spareblock->n_spares_used) { if (errs >= 2) { - printk("HPFS: Hotfixes not supported here, try chkdsk\n"); + pr_err("Hotfixes not supported here, try chkdsk\n"); mark_dirty(s, 0); goto bail4; } hpfs_error(s, "hotfixes not supported here, try chkdsk"); - if (errs == 0) printk("HPFS: Proceeding, but your filesystem will be probably corrupted by this driver...\n"); - else printk("HPFS: This driver may read bad files or crash when operating on disk with hotfixes.\n"); + if (errs == 0) + pr_err("Proceeding, but your filesystem will be probably corrupted by this driver...\n"); + else + pr_err("This driver may read bad files or crash when operating on disk with hotfixes.\n"); } if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { if (errs >= 2) { - printk("HPFS: Spare dnodes used, try chkdsk\n"); + pr_err("Spare dnodes used, try chkdsk\n"); mark_dirty(s, 0); goto bail4; } hpfs_error(s, "warning: spare dnodes used, try chkdsk"); - if (errs == 0) printk("HPFS: Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); + if (errs == 0) + pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); } if (chk) { unsigned a; @@ -625,12 +653,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) goto bail4; } sbi->sb_dirband_size = a; - } else printk("HPFS: You really don't want any checks? You are crazy...\n"); + } else + pr_err("You really don't want any checks? You are crazy...\n"); /* Load code page table */ if (le32_to_cpu(spareblock->n_code_pages)) if (!(sbi->sb_cp_table = hpfs_load_code_page(s, le32_to_cpu(spareblock->code_page_dir)))) - printk("HPFS: Warning: code page support is disabled\n"); + pr_err("code page support is disabled\n"); brelse(bh2); brelse(bh1); @@ -679,10 +708,7 @@ bail2: brelse(bh0); bail1: bail0: hpfs_unlock(s); - kfree(sbi->sb_bmp_dir); - kfree(sbi->sb_cp_table); - s->s_fs_info = NULL; - kfree(sbi); + free_sbi(sbi); return -EINVAL; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d19b30ababf..1e2872b2534 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -6,6 +6,8 @@ * Copyright (C) 2002 Linus Torvalds. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/thread_info.h> #include <asm/current.h> @@ -366,7 +368,13 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart) static void hugetlbfs_evict_inode(struct inode *inode) { + struct resv_map *resv_map; + truncate_hugepages(inode, 0); + resv_map = (struct resv_map *)inode->i_mapping->private_data; + /* root inode doesn't have the resv_map, so we should check it */ + if (resv_map) + resv_map_release(&resv_map->refs); clear_inode(inode); } @@ -469,13 +477,18 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, * annotation because huge_pmd_share() does an allocation under * i_mmap_mutex. */ -struct lock_class_key hugetlbfs_i_mmap_mutex_key; +static struct lock_class_key hugetlbfs_i_mmap_mutex_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev) { struct inode *inode; + struct resv_map *resv_map; + + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; inode = new_inode(sb); if (inode) { @@ -487,7 +500,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - INIT_LIST_HEAD(&inode->i_mapping->private_list); + inode->i_mapping->private_data = resv_map; info = HUGETLBFS_I(inode); /* * The policy is initialized here even if we are creating a @@ -517,7 +530,9 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); - } + } else + kref_put(&resv_map->refs, resv_map_release); + return inode; } @@ -810,8 +825,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) ps = memparse(args[0].from, &rest); pconfig->hstate = size_to_hstate(ps); if (!pconfig->hstate) { - printk(KERN_ERR - "hugetlbfs: Unsupported page size %lu MB\n", + pr_err("Unsupported page size %lu MB\n", ps >> 20); return -EINVAL; } @@ -819,8 +833,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) } default: - printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", - p); + pr_err("Bad mount option: \"%s\"\n", p); return -EINVAL; break; } @@ -840,8 +853,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) return 0; bad_val: - printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", - args[0].from, p); + pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); return -EINVAL; } @@ -889,8 +901,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) goto out_free; return 0; out_free: - if (sbinfo->spool) - kfree(sbinfo->spool); + kfree(sbinfo->spool); kfree(sbinfo); return -ENOMEM; } @@ -926,7 +937,7 @@ static int get_hstate_idx(int page_size_log) return h - hstates; } -static struct dentry_operations anon_ops = { +static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; @@ -957,8 +968,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, *user = current_user(); if (user_shm_lock(size, *user)) { task_lock(current); - printk_once(KERN_WARNING - "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", + pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", current->comm, current->pid); task_unlock(current); } else { @@ -1017,6 +1027,11 @@ static int __init init_hugetlbfs_fs(void) int error; int i; + if (!hugepages_supported()) { + pr_info("disabling because there are no supported hugepage sizes\n"); + return -ENOTSUPP; + } + error = bdi_init(&hugetlbfs_backing_dev_info); if (error) return error; @@ -1042,7 +1057,7 @@ static int __init init_hugetlbfs_fs(void) buf); if (IS_ERR(hugetlbfs_vfsmount[i])) { - pr_err("hugetlb: Cannot mount internal hugetlbfs for " + pr_err("Cannot mount internal hugetlbfs for " "page size %uK", ps_kb); error = PTR_ERR(hugetlbfs_vfsmount[i]); hugetlbfs_vfsmount[i] = NULL; diff --git a/fs/inode.c b/fs/inode.c index b33ba8e021c..6eecb7ff0b9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -105,7 +105,7 @@ long get_nr_dirty_inodes(void) * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL -int proc_nr_inodes(ctl_table *table, int write, +int proc_nr_inodes(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); @@ -503,6 +503,7 @@ void clear_inode(struct inode *inode) */ spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); + BUG_ON(inode->i_data.nrshadows); spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); @@ -548,8 +549,7 @@ static void evict(struct inode *inode) if (op->evict_inode) { op->evict_inode(inode); } else { - if (inode->i_data.nrpages) - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } if (S_ISBLK(inode->i_mode) && inode->i_bdev) @@ -773,15 +773,11 @@ static struct inode *find_inode(struct super_block *sb, repeat: hlist_for_each_entry(inode, head, i_hash) { - spin_lock(&inode->i_lock); - if (inode->i_sb != sb) { - spin_unlock(&inode->i_lock); + if (inode->i_sb != sb) continue; - } - if (!test(inode, data)) { - spin_unlock(&inode->i_lock); + if (!test(inode, data)) continue; - } + spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; @@ -804,15 +800,11 @@ static struct inode *find_inode_fast(struct super_block *sb, repeat: hlist_for_each_entry(inode, head, i_hash) { - spin_lock(&inode->i_lock); - if (inode->i_ino != ino) { - spin_unlock(&inode->i_lock); + if (inode->i_ino != ino) continue; - } - if (inode->i_sb != sb) { - spin_unlock(&inode->i_lock); + if (inode->i_sb != sb) continue; - } + spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; @@ -951,6 +943,41 @@ void unlock_new_inode(struct inode *inode) EXPORT_SYMBOL(unlock_new_inode); /** + * lock_two_nondirectories - take two i_mutexes on non-directory objects + * + * Lock any non-NULL argument that is not a directory. + * Zero, one or two objects may be locked by this function. + * + * @inode1: first inode to lock + * @inode2: second inode to lock + */ +void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) +{ + if (inode1 > inode2) + swap(inode1, inode2); + + if (inode1 && !S_ISDIR(inode1->i_mode)) + mutex_lock(&inode1->i_mutex); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); +} +EXPORT_SYMBOL(lock_two_nondirectories); + +/** + * unlock_two_nondirectories - release locks from lock_two_nondirectories() + * @inode1: first inode to unlock + * @inode2: second inode to unlock + */ +void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) +{ + if (inode1 && !S_ISDIR(inode1->i_mode)) + mutex_unlock(&inode1->i_mutex); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + mutex_unlock(&inode2->i_mutex); +} +EXPORT_SYMBOL(unlock_two_nondirectories); + +/** * iget5_locked - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get @@ -1575,7 +1602,11 @@ static int __remove_suid(struct dentry *dentry, int kill) struct iattr newattrs; newattrs.ia_valid = ATTR_FORCE | kill; - return notify_change(dentry, &newattrs); + /* + * Note we call this on write, so notify_change will not + * encounter any conflicting delegations: + */ + return notify_change(dentry, &newattrs, NULL); } int file_remove_suid(struct file *file) @@ -1808,14 +1839,18 @@ EXPORT_SYMBOL(inode_init_owner); * inode_owner_or_capable - check current task permissions to inode * @inode: inode being checked * - * Return true if current either has CAP_FOWNER to the inode, or - * owns the file. + * Return true if current either has CAP_FOWNER in a namespace with the + * inode owner uid mapped, or owns the file. */ bool inode_owner_or_capable(const struct inode *inode) { + struct user_namespace *ns; + if (uid_eq(current_fsuid(), inode->i_uid)) return true; - if (inode_capable(inode, CAP_FOWNER)) + + ns = current_user_ns(); + if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid)) return true; return false; } @@ -1867,3 +1902,34 @@ void inode_dio_done(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } EXPORT_SYMBOL(inode_dio_done); + +/* + * inode_set_flags - atomically set some inode flags + * + * Note: the caller should be holding i_mutex, or else be sure that + * they have exclusive access to the inode structure (i.e., while the + * inode is being instantiated). The reason for the cmpxchg() loop + * --- which wouldn't be necessary if all code paths which modify + * i_flags actually followed this rule, is that there is at least one + * code path which doesn't today --- for example, + * __generic_file_aio_write() calls file_remove_suid() without holding + * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * + * In the long run, i_mutex is overkill, and we should probably look + * at using the i_lock spinlock to protect i_flags, and then make sure + * it is so documented in include/linux/fs.h and that all code follows + * the locking convention!! + */ +void inode_set_flags(struct inode *inode, unsigned int flags, + unsigned int mask) +{ + unsigned int old_flags, new_flags; + + WARN_ON_ONCE(flags & ~mask); + do { + old_flags = ACCESS_ONCE(inode->i_flags); + new_flags = (old_flags & ~mask) | flags; + } while (unlikely(cmpxchg(&inode->i_flags, old_flags, + new_flags) != old_flags)); +} +EXPORT_SYMBOL(inode_set_flags); diff --git a/fs/internal.h b/fs/internal.h index 513e0d859a6..46574240746 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -9,8 +9,6 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/lglock.h> - struct super_block; struct file_system_type; struct linux_binprm; @@ -62,8 +60,6 @@ extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -extern struct lglock vfsmount_lock; - extern int __mnt_want_write(struct vfsmount *); extern int __mnt_want_write_file(struct file *); extern void __mnt_drop_write(struct vfsmount *); @@ -77,9 +73,6 @@ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ -extern void file_sb_list_add(struct file *f, struct super_block *sb); -extern void file_sb_list_del(struct file *f); -extern void mark_files_ro(struct super_block *); extern struct file *get_empty_filp(void); /* diff --git a/fs/ioctl.c b/fs/ioctl.c index fd507fb460f..8ac3fad3619 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -37,7 +37,7 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd, { int error = -ENOTTY; - if (!filp->f_op || !filp->f_op->unlocked_ioctl) + if (!filp->f_op->unlocked_ioctl) goto out; error = filp->f_op->unlocked_ioctl(filp, cmd, arg); @@ -501,7 +501,7 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp, /* Did FASYNC state change ? */ if ((flag ^ filp->f_flags) & FASYNC) { - if (filp->f_op && filp->f_op->fasync) + if (filp->f_op->fasync) /* fasync() adjusts filp->f_flags */ error = filp->f_op->fasync(fd, filp, on); else diff --git a/fs/ioprio.c b/fs/ioprio.c deleted file mode 100644 index e50170ca7c3..00000000000 --- a/fs/ioprio.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * fs/ioprio.c - * - * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk> - * - * Helper functions for setting/querying io priorities of processes. The - * system calls closely mimmick getpriority/setpriority, see the man page for - * those. The prio argument is a composite of prio class and prio data, where - * the data argument has meaning within that class. The standard scheduling - * classes have 8 distinct prio levels, with 0 being the highest prio and 7 - * being the lowest. - * - * IOW, setting BE scheduling class with prio 2 is done ala: - * - * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; - * - * ioprio_set(PRIO_PROCESS, pid, prio); - * - * See also Documentation/block/ioprio.txt - * - */ -#include <linux/gfp.h> -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/ioprio.h> -#include <linux/blkdev.h> -#include <linux/capability.h> -#include <linux/syscalls.h> -#include <linux/security.h> -#include <linux/pid_namespace.h> - -int set_task_ioprio(struct task_struct *task, int ioprio) -{ - int err; - struct io_context *ioc; - const struct cred *cred = current_cred(), *tcred; - - rcu_read_lock(); - tcred = __task_cred(task); - if (!uid_eq(tcred->uid, cred->euid) && - !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); - - err = security_task_setioprio(task, ioprio); - if (err) - return err; - - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); - } - - return err; -} -EXPORT_SYMBOL_GPL(set_task_ioprio); - -SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) -{ - int class = IOPRIO_PRIO_CLASS(ioprio); - int data = IOPRIO_PRIO_DATA(ioprio); - struct task_struct *p, *g; - struct user_struct *user; - struct pid *pgrp; - kuid_t uid; - int ret; - - switch (class) { - case IOPRIO_CLASS_RT: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - /* fall through, rt has prio field too */ - case IOPRIO_CLASS_BE: - if (data >= IOPRIO_BE_NR || data < 0) - return -EINVAL; - - break; - case IOPRIO_CLASS_IDLE: - break; - case IOPRIO_CLASS_NONE: - if (data) - return -EINVAL; - break; - default: - return -EINVAL; - } - - ret = -ESRCH; - rcu_read_lock(); - switch (which) { - case IOPRIO_WHO_PROCESS: - if (!who) - p = current; - else - p = find_task_by_vpid(who); - if (p) - ret = set_task_ioprio(p, ioprio); - break; - case IOPRIO_WHO_PGRP: - if (!who) - pgrp = task_pgrp(current); - else - pgrp = find_vpid(who); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - ret = set_task_ioprio(p, ioprio); - if (ret) - break; - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case IOPRIO_WHO_USER: - uid = make_kuid(current_user_ns(), who); - if (!uid_valid(uid)) - break; - if (!who) - user = current_user(); - else - user = find_user(uid); - - if (!user) - break; - - do_each_thread(g, p) { - if (!uid_eq(task_uid(p), uid)) - continue; - ret = set_task_ioprio(p, ioprio); - if (ret) - goto free_uid; - } while_each_thread(g, p); -free_uid: - if (who) - free_uid(user); - break; - default: - ret = -EINVAL; - } - - rcu_read_unlock(); - return ret; -} - -static int get_task_ioprio(struct task_struct *p) -{ - int ret; - - ret = security_task_getioprio(p); - if (ret) - goto out; - ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); - if (p->io_context) - ret = p->io_context->ioprio; -out: - return ret; -} - -int ioprio_best(unsigned short aprio, unsigned short bprio) -{ - unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); - unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); - - if (aclass == IOPRIO_CLASS_NONE) - aclass = IOPRIO_CLASS_BE; - if (bclass == IOPRIO_CLASS_NONE) - bclass = IOPRIO_CLASS_BE; - - if (aclass == bclass) - return min(aprio, bprio); - if (aclass > bclass) - return bprio; - else - return aprio; -} - -SYSCALL_DEFINE2(ioprio_get, int, which, int, who) -{ - struct task_struct *g, *p; - struct user_struct *user; - struct pid *pgrp; - kuid_t uid; - int ret = -ESRCH; - int tmpio; - - rcu_read_lock(); - switch (which) { - case IOPRIO_WHO_PROCESS: - if (!who) - p = current; - else - p = find_task_by_vpid(who); - if (p) - ret = get_task_ioprio(p); - break; - case IOPRIO_WHO_PGRP: - if (!who) - pgrp = task_pgrp(current); - else - pgrp = find_vpid(who); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - tmpio = get_task_ioprio(p); - if (tmpio < 0) - continue; - if (ret == -ESRCH) - ret = tmpio; - else - ret = ioprio_best(ret, tmpio); - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case IOPRIO_WHO_USER: - uid = make_kuid(current_user_ns(), who); - if (!who) - user = current_user(); - else - user = find_user(uid); - - if (!user) - break; - - do_each_thread(g, p) { - if (!uid_eq(task_uid(p), user->uid)) - continue; - tmpio = get_task_ioprio(p); - if (tmpio < 0) - continue; - if (ret == -ESRCH) - ret = tmpio; - else - ret = ioprio_best(ret, tmpio); - } while_each_thread(g, p); - - if (who) - free_uid(user); - break; - default: - ret = -EINVAL; - } - - rcu_read_unlock(); - return ret; -} diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index e5d408a7ea4..4556ce1af5b 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -93,7 +93,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), @@ -117,6 +117,7 @@ static void destroy_inodecache(void) static int isofs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (!(*flags & MS_RDONLY)) return -EROFS; return 0; @@ -181,7 +182,7 @@ struct iso9660_options{ * Compute the hash for the isofs name corresponding to the dentry. */ static int -isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) +isofs_hash_common(struct qstr *qstr, int ms) { const char *name; int len; @@ -202,7 +203,7 @@ isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) * Compute the hash for the isofs name corresponding to the dentry. */ static int -isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms) +isofs_hashi_common(struct qstr *qstr, int ms) { const char *name; int len; @@ -259,13 +260,13 @@ static int isofs_dentry_cmp_common( static int isofs_hash(const struct dentry *dentry, struct qstr *qstr) { - return isofs_hash_common(dentry, qstr, 0); + return isofs_hash_common(qstr, 0); } static int isofs_hashi(const struct dentry *dentry, struct qstr *qstr) { - return isofs_hashi_common(dentry, qstr, 0); + return isofs_hashi_common(qstr, 0); } static int @@ -286,13 +287,13 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry, static int isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) { - return isofs_hash_common(dentry, qstr, 1); + return isofs_hash_common(qstr, 1); } static int isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr) { - return isofs_hashi_common(dentry, qstr, 1); + return isofs_hashi_common(qstr, 1); } static int diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 2d04f9afafd..06fe11e0abf 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -573,7 +573,7 @@ int log_wait_commit(journal_t *journal, tid_t tid) #ifdef CONFIG_JBD_DEBUG spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_EMERG + printk(KERN_ERR "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } @@ -604,10 +604,8 @@ int log_wait_commit(journal_t *journal, tid_t tid) out_unlock: spin_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) { - printk(KERN_EMERG "journal commit I/O error\n"); + if (unlikely(is_journal_aborted(journal))) err = -EIO; - } return err; } @@ -2136,7 +2134,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n); #endif jbd_remove_debugfs_entry(); journal_destroy_caches(); diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 25c713e7071..8898bbd2b61 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -231,19 +231,15 @@ record_cache_failure: static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) { - int shift = 0; - int tmp = hash_size; + int i; struct jbd_revoke_table_s *table; table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); if (!table) goto out; - while((tmp >>= 1UL) != 0UL) - shift++; - table->hash_size = hash_size; - table->hash_shift = shift; + table->hash_shift = ilog2(hash_size); table->hash_table = kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { @@ -252,8 +248,8 @@ static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) goto out; } - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&table->hash_table[tmp]); + for (i = 0; i < hash_size; i++) + INIT_LIST_HEAD(&table->hash_table[i]); out: return table; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index be0c39b66fe..1695ba8334a 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -26,7 +26,6 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hrtimer.h> -#include <linux/backing-dev.h> static void __journal_temp_unlink_buffer(struct journal_head *jh); @@ -100,10 +99,11 @@ static int start_this_handle(journal_t *journal, handle_t *handle) alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS); + new_transaction = kzalloc(sizeof(*new_transaction), + GFP_NOFS|__GFP_NOFAIL); if (!new_transaction) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto alloc_transaction; + ret = -ENOMEM; + goto out; } } @@ -675,7 +675,7 @@ repeat: jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", __func__); JBUFFER_TRACE(jh, "oom!"); @@ -898,7 +898,7 @@ repeat: if (!jh->b_committed_data) { committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", + printk(KERN_ERR "%s: No memory for committed data\n", __func__); err = -ENOMEM; goto out; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index cf2fc059406..6fac7434985 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -43,7 +43,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) clear_buffer_uptodate(bh); if (orig_bh) { clear_bit_unlock(BH_Shadow, &orig_bh->b_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&orig_bh->b_state, BH_Shadow); } unlock_buffer(bh); @@ -239,7 +239,7 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } spin_unlock(&journal->j_list_lock); @@ -277,7 +277,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, } spin_lock(&journal->j_list_lock); clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); jbd2_journal_write_revoke_records(journal, commit_transaction, &log_bufs, WRITE_SYNC); - blk_finish_plug(&plug); jbd_debug(3, "JBD2: commit phase 2b\n"); @@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) err = 0; bufs = 0; descriptor = NULL; - blk_start_plug(&plug); while (commit_transaction->t_buffers) { /* Find the next buffer to be journaled... */ @@ -1067,6 +1065,25 @@ restart_loop: goto restart_loop; } + /* Add the transaction to the checkpoint list + * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + spin_unlock(&journal->j_list_lock); + /* Done with this transaction! */ jbd_debug(3, "JBD2: commit phase 7\n"); @@ -1085,24 +1102,7 @@ restart_loop: atomic_read(&commit_transaction->t_handle_count); trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, commit_transaction->t_tid, &stats.run); - - /* - * Calculate overall stats - */ - spin_lock(&journal->j_history_lock); - journal->j_stats.ts_tid++; - if (commit_transaction->t_requested) - journal->j_stats.ts_requested++; - journal->j_stats.run.rs_wait += stats.run.rs_wait; - journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; - journal->j_stats.run.rs_running += stats.run.rs_running; - journal->j_stats.run.rs_locked += stats.run.rs_locked; - journal->j_stats.run.rs_flushing += stats.run.rs_flushing; - journal->j_stats.run.rs_logging += stats.run.rs_logging; - journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; - journal->j_stats.run.rs_blocks += stats.run.rs_blocks; - journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; - spin_unlock(&journal->j_history_lock); + stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0; commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); @@ -1122,24 +1122,6 @@ restart_loop: write_unlock(&journal->j_state_lock); - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = - commit_transaction; - } - spin_unlock(&journal->j_list_lock); - /* Drop all spin_locks because commit_callback may be block. - * __journal_remove_checkpoint() can not destroy transaction - * under us because it is not marked as T_FINISHED yet */ if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); @@ -1150,7 +1132,7 @@ restart_loop: write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); commit_transaction->t_state = T_FINISHED; - /* Recheck checkpoint lists after j_list_lock was dropped */ + /* Check if the transaction can be dropped now that we are finished */ if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); @@ -1159,4 +1141,21 @@ restart_loop: spin_unlock(&journal->j_list_lock); write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); + + /* + * Calculate overall stats + */ + spin_lock(&journal->j_history_lock); + journal->j_stats.ts_tid++; + journal->j_stats.ts_requested += stats.ts_requested; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 52032647dd4..67b8e303946 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug); #endif /* Checksumming functions */ -int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) +static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) return cpu_to_be32(csum); } -int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) +static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) return sb->s_checksum == jbd2_superblock_csum(j, sb); } -void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) +static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; @@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal) journal->j_flags |= JBD2_UNMOUNT; while (journal->j_task) { - wake_up(&journal->j_wait_commit); write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, journal->j_task == NULL); write_lock(&journal->j_state_lock); } @@ -702,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) read_lock(&journal->j_state_lock); #ifdef CONFIG_JBD2_DEBUG if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_EMERG + printk(KERN_ERR "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } @@ -710,18 +710,16 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); - wake_up(&journal->j_wait_commit); read_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, !tid_gt(tid, journal->j_commit_sequence)); read_lock(&journal->j_state_lock); } read_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) { - printk(KERN_EMERG "journal commit I/O error\n"); + if (unlikely(is_journal_aborted(journal))) err = -EIO; - } return err; } @@ -1527,13 +1525,13 @@ static int journal_get_superblock(journal_t *journal) if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { /* Can't have checksum v1 and v2 on at the same time! */ - printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 " + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " "at the same time!\n"); goto out; } if (!jbd2_verify_csum_type(journal, sb)) { - printk(KERN_ERR "JBD: Unknown checksum type\n"); + printk(KERN_ERR "JBD2: Unknown checksum type\n"); goto out; } @@ -1541,7 +1539,7 @@ static int journal_get_superblock(journal_t *journal) if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD: Cannot load crc32c driver.\n"); + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); err = PTR_ERR(journal->j_chksum_driver); journal->j_chksum_driver = NULL; goto out; @@ -1550,7 +1548,7 @@ static int journal_get_superblock(journal_t *journal) /* Check superblock checksum */ if (!jbd2_superblock_csum_verify(journal, sb)) { - printk(KERN_ERR "JBD: journal checksum error\n"); + printk(KERN_ERR "JBD2: journal checksum error\n"); goto out; } @@ -1836,7 +1834,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD: Cannot load crc32c " + printk(KERN_ERR "JBD2: Cannot load crc32c " "driver.\n"); journal->j_chksum_driver = NULL; return 0; @@ -2645,7 +2643,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD2_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); + printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n); #endif jbd2_remove_jbd_stats_proc_entry(); jbd2_journal_destroy_caches(); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 3929c50428b..3b6bb19d60b 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -594,7 +594,7 @@ static int do_one_pass(journal_t *journal, be32_to_cpu(tmp->h_sequence))) { brelse(obh); success = -EIO; - printk(KERN_ERR "JBD: Invalid " + printk(KERN_ERR "JBD2: Invalid " "checksum recovering " "block %llu in log\n", blocknr); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 7aa9a32573b..6f0f590cc5a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -514,11 +514,13 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, * similarly constrained call sites */ ret = start_this_handle(journal, handle, GFP_NOFS); - if (ret < 0) + if (ret < 0) { jbd2_journal_free_reserved(handle); + return ret; + } handle->h_type = type; handle->h_line_no = line_no; - return ret; + return 0; } EXPORT_SYMBOL(jbd2_journal_start_reserved); @@ -932,7 +934,7 @@ repeat: jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", __func__); JBUFFER_TRACE(jh, "oom!"); @@ -1071,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * reused here. */ jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); J_ASSERT_JH(jh, (jh->b_transaction == transaction || jh->b_transaction == NULL || (jh->b_transaction == journal->j_committing_transaction && @@ -1094,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) jh->b_modified = 0; JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); } else if (jh->b_transaction == journal->j_committing_transaction) { /* first access by this transaction */ jh->b_modified = 0; JBUFFER_TRACE(jh, "set next transaction"); + spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; } spin_unlock(&journal->j_list_lock); @@ -1166,7 +1169,7 @@ repeat: if (!jh->b_committed_data) { committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", + printk(KERN_ERR "%s: No memory for committed data\n", __func__); err = -ENOMEM; goto out; @@ -1290,7 +1293,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * once a transaction -bzzz */ jh->b_modified = 1; - J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + if (handle->h_buffer_credits <= 0) { + ret = -ENOSPC; + goto out_unlock_bh; + } handle->h_buffer_credits--; } @@ -1305,9 +1311,9 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "fastpath"); if (unlikely(jh->b_transaction != journal->j_running_transaction)) { - printk(KERN_EMERG "JBD: %s: " + printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " - "journal->j_running_transaction (%p, %u)", + "journal->j_running_transaction (%p, %u)\n", journal->j_devname, (unsigned long long) bh->b_blocknr, jh->b_transaction, @@ -1330,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); - if (unlikely(jh->b_transaction != - journal->j_committing_transaction)) { - printk(KERN_EMERG "JBD: %s: " - "jh->b_transaction (%llu, %p, %u) != " - "journal->j_committing_transaction (%p, %u)", + if (unlikely(((jh->b_transaction != + journal->j_committing_transaction)) || + (jh->b_next_transaction != transaction))) { + printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: " + "bad jh for block %llu: " + "transaction (%p, %u), " + "jh->b_transaction (%p, %u), " + "jh->b_next_transaction (%p, %u), jlist %u\n", journal->j_devname, (unsigned long long) bh->b_blocknr, + transaction, transaction->t_tid, jh->b_transaction, - jh->b_transaction ? jh->b_transaction->t_tid : 0, - journal->j_committing_transaction, - journal->j_committing_transaction ? - journal->j_committing_transaction->t_tid : 0); - ret = -EINVAL; - } - if (unlikely(jh->b_next_transaction != transaction)) { - printk(KERN_EMERG "JBD: %s: " - "jh->b_next_transaction (%llu, %p, %u) != " - "transaction (%p, %u)", - journal->j_devname, - (unsigned long long) bh->b_blocknr, + jh->b_transaction ? + jh->b_transaction->t_tid : 0, jh->b_next_transaction, jh->b_next_transaction ? jh->b_next_transaction->t_tid : 0, - transaction, transaction->t_tid); + jh->b_jlist); + WARN_ON(1); ret = -EINVAL; } /* And this case is illegal: we can't reuse another @@ -1373,7 +1374,6 @@ out_unlock_bh: jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); - WARN_ON(ret); /* All errors are bugs, so dump the stack */ return ret; } @@ -1411,7 +1411,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); if (!buffer_jbd(bh)) goto not_jbd; @@ -1464,6 +1463,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) * we know to remove the checkpoint after we commit. */ + spin_lock(&journal->j_list_lock); if (jh->b_cp_transaction) { __jbd2_journal_temp_unlink_buffer(jh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); @@ -1476,6 +1476,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) goto drop; } } + spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { J_ASSERT_JH(jh, (jh->b_transaction == journal->j_committing_transaction)); @@ -1487,7 +1488,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (jh->b_next_transaction) { J_ASSERT(jh->b_next_transaction == transaction); + spin_lock(&journal->j_list_lock); jh->b_next_transaction = NULL; + spin_unlock(&journal->j_list_lock); /* * only drop a reference if this transaction modified @@ -1499,7 +1502,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) } not_jbd: - spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); __brelse(bh); drop: @@ -1586,9 +1588,12 @@ int jbd2_journal_stop(handle_t *handle) * to perform a synchronous write. We do this to detect the * case where a single process is doing a stream of sync * writes. No point in waiting for joiners in that case. + * + * Setting max_batch_time to 0 disables this completely. */ pid = current->pid; - if (handle->h_sync && journal->j_last_sync_writer != pid) { + if (handle->h_sync && journal->j_last_sync_writer != pid && + journal->j_max_batch_time) { u64 commit_time, trans_time; journal->j_last_sync_writer = pid; @@ -1817,11 +1822,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) if (buffer_locked(bh) || buffer_dirty(bh)) goto out; - if (jh->b_next_transaction != NULL) + if (jh->b_next_transaction != NULL || jh->b_transaction != NULL) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + if (jh->b_cp_transaction != NULL) { /* written-back checkpointed metadata buffer */ JBUFFER_TRACE(jh, "remove from checkpoint list"); __jbd2_journal_remove_checkpoint(jh); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 223283c3011..009ec0b5993 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -178,10 +178,6 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type) char *value = NULL; int rc, xprefix; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; @@ -232,13 +228,10 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int rc, xprefix; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; @@ -277,30 +270,21 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int rc; cache_no_acl(inode); - if (S_ISLNK(*i_mode)) - return 0; /* Symlink always has no-ACL */ - - acl = jffs2_get_acl(dir_i, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (!acl) { - *i_mode &= ~current_umask(); - } else { - if (S_ISDIR(*i_mode)) - set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - - rc = posix_acl_create(&acl, GFP_KERNEL, i_mode); - if (rc < 0) - return rc; - if (rc > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + rc = posix_acl_create(dir_i, i_mode, &default_acl, &acl); + if (rc) + return rc; + if (default_acl) { + set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); + posix_acl_release(default_acl); + } + if (acl) { + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); } return 0; @@ -324,106 +308,3 @@ int jffs2_init_acl_post(struct inode *inode) return 0; } - -int jffs2_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int rc; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (rc) - return rc; - rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return rc; -} - -static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (list && retlen <= list_size) - strcpy(list, POSIX_ACL_XATTR_ACCESS); - return retlen; -} - -static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (list && retlen <= list_size) - strcpy(list, POSIX_ACL_XATTR_DEFAULT); - return retlen; -} - -static int jffs2_acl_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct posix_acl *acl; - int rc; - - if (name[0] != '\0') - return -EINVAL; - - acl = jffs2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return -ENODATA; - rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return rc; -} - -static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct posix_acl *acl; - int rc; - - if (name[0] != '\0') - return -EINVAL; - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - rc = posix_acl_valid(acl); - if (rc) - goto out; - } - } else { - acl = NULL; - } - rc = jffs2_set_acl(dentry->d_inode, type, acl); - out: - posix_acl_release(acl); - return rc; -} - -const struct xattr_handler jffs2_acl_access_xattr_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_DEFAULT, - .list = jffs2_acl_access_listxattr, - .get = jffs2_acl_getxattr, - .set = jffs2_acl_setxattr, -}; - -const struct xattr_handler jffs2_acl_default_xattr_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = jffs2_acl_default_listxattr, - .get = jffs2_acl_getxattr, - .set = jffs2_acl_setxattr, -}; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 9b477246f2a..2e2b5745c3b 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -27,17 +27,14 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type); -extern int jffs2_acl_chmod(struct inode *); +int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); -extern const struct xattr_handler jffs2_acl_access_xattr_handler; -extern const struct xattr_handler jffs2_acl_default_xattr_handler; - #else #define jffs2_get_acl (NULL) -#define jffs2_acl_chmod(inode) (0) +#define jffs2_set_acl (NULL) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 2b60ce1996a..bb9cebc9ca8 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -75,10 +75,13 @@ void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c) static int jffs2_garbage_collect_thread(void *_c) { struct jffs2_sb_info *c = _c; + sigset_t hupmask; + siginitset(&hupmask, sigmask(SIGHUP)); allow_signal(SIGKILL); allow_signal(SIGSTOP); allow_signal(SIGCONT); + allow_signal(SIGHUP); c->gc_task = current; complete(&c->gc_thread_start); @@ -87,7 +90,7 @@ static int jffs2_garbage_collect_thread(void *_c) set_freezable(); for (;;) { - allow_signal(SIGHUP); + sigprocmask(SIG_UNBLOCK, &hupmask, NULL); again: spin_lock(&c->erase_completion_lock); if (!jffs2_thread_should_wake(c)) { @@ -95,10 +98,9 @@ static int jffs2_garbage_collect_thread(void *_c) spin_unlock(&c->erase_completion_lock); jffs2_dbg(1, "%s(): sleeping...\n", __func__); schedule(); - } else + } else { spin_unlock(&c->erase_completion_lock); - - + } /* Problem - immediately after bootup, the GCD spends a lot * of time in places like jffs2_kill_fragtree(); so much so * that userspace processes (like gdm and X) are starved @@ -150,7 +152,7 @@ static int jffs2_garbage_collect_thread(void *_c) } } /* We don't want SIGHUP to interrupt us. STOP and KILL are OK though. */ - disallow_signal(SIGHUP); + sigprocmask(SIG_BLOCK, &hupmask, NULL); jffs2_dbg(1, "%s(): pass\n", __func__); if (jffs2_garbage_collect_pass(c) == -ENOSPC) { diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c index 16a5047903a..406d9cc84ba 100644 --- a/fs/jffs2/compr_rtime.c +++ b/fs/jffs2/compr_rtime.c @@ -33,7 +33,7 @@ static int jffs2_rtime_compress(unsigned char *data_in, unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) { - short positions[256]; + unsigned short positions[256]; int outpos = 0; int pos=0; @@ -74,7 +74,7 @@ static int jffs2_rtime_decompress(unsigned char *data_in, unsigned char *cpage_out, uint32_t srclen, uint32_t destlen) { - short positions[256]; + unsigned short positions[256]; int outpos = 0; int pos=0; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index e3aac222472..938556025d6 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -59,6 +59,7 @@ const struct inode_operations jffs2_dir_inode_operations = .mknod = jffs2_mknod, .rename = jffs2_rename, .get_acl = jffs2_get_acl, + .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 1506673c087..64989ca9ba9 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -51,10 +51,10 @@ const struct file_operations jffs2_file_operations = { .llseek = generic_file_llseek, .open = generic_file_open, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .unlocked_ioctl=jffs2_ioctl, .mmap = generic_file_readonly_mmap, .fsync = jffs2_fsync, @@ -66,6 +66,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { .get_acl = jffs2_get_acl, + .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index fe3c0527545..601afd1afdd 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,15 +190,16 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) { + struct inode *inode = dentry->d_inode; int rc; - rc = inode_change_ok(dentry->d_inode, iattr); + rc = inode_change_ok(inode, iattr); if (rc) return rc; - rc = jffs2_do_setattr(dentry->d_inode, iattr); + rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = jffs2_acl_chmod(dentry->d_inode); + rc = posix_acl_chmod(inode, inode->i_mode); return rc; } @@ -241,7 +242,7 @@ void jffs2_evict_inode (struct inode *inode) jffs2_dbg(1, "%s(): ino #%lu mode %o\n", __func__, inode->i_ino, inode->i_mode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); jffs2_do_clear_inode(c, f); } @@ -456,12 +457,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r The umask is only applied if there's no default ACL */ ret = jffs2_init_acl_pre(dir_i, inode, &mode); if (ret) { - make_bad_inode(inode); - iput(inode); - return ERR_PTR(ret); + mutex_unlock(&f->sem); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(ret); } ret = jffs2_do_new_inode (c, f, mode, ri); if (ret) { + mutex_unlock(&f->sem); make_bad_inode(inode); iput(inode); return ERR_PTR(ret); @@ -478,6 +481,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r inode->i_size = 0; if (insert_inode_locked(inode) < 0) { + mutex_unlock(&f->sem); make_bad_inode(inode); iput(inode); return ERR_PTR(-EINVAL); @@ -515,6 +519,10 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) c = JFFS2_SB_INFO(sb); + /* Do not support the MLC nand */ + if (c->mtd->type == MTD_MLCNANDFLASH) + return -EINVAL; + #ifndef CONFIG_JFFS2_FS_WRITEBUFFER if (c->mtd->type == MTD_NANDFLASH) { pr_err("Cannot operate on NAND flash unless jffs2 NAND support is compiled in\n"); @@ -682,7 +690,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, struct inode *inode = OFNI_EDONI_2SFFJ(f); struct page *pg; - pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, + pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, (void *)jffs2_do_readpage_unlock, inode); if (IS_ERR(pg)) return (void *)pg; diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index 4f47aa24b55..b8fd651307a 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -288,6 +288,8 @@ struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) struct jffs2_xattr_datum *xd; xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); dbg_memalloc("%p\n", xd); + if (!xd) + return NULL; xd->class = RAWNODE_CLASS_XATTR_DATUM; xd->node = (void *)xd; @@ -306,6 +308,8 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) struct jffs2_xattr_ref *ref; ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); dbg_memalloc("%p\n", ref); + if (!ref) + return NULL; ref->class = RAWNODE_CLASS_XATTR_REF; ref->node = (void *)ref; diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index 975a1f562c1..9a5449bc3af 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -564,25 +564,10 @@ struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_ they're killed. */ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) { - struct jffs2_node_frag *frag; - struct jffs2_node_frag *parent; - - if (!root->rb_node) - return; + struct jffs2_node_frag *frag, *next; dbg_fragtree("killing\n"); - - frag = (rb_entry(root->rb_node, struct jffs2_node_frag, rb)); - while(frag) { - if (frag->rb.rb_left) { - frag = frag_left(frag); - continue; - } - if (frag->rb.rb_right) { - frag = frag_right(frag); - continue; - } - + rbtree_postorder_for_each_entry_safe(frag, next, root, rb) { if (frag->node && !(--frag->node->frags)) { /* Not a hole, and it's the final remaining frag of this node. Free the node */ @@ -591,17 +576,8 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) jffs2_free_full_dnode(frag->node); } - parent = frag_parent(frag); - if (parent) { - if (frag_left(parent) == frag) - parent->rb.rb_left = NULL; - else - parent->rb.rb_right = NULL; - } jffs2_free_node_frag(frag); - frag = parent; - cond_resched(); } } diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index e4619b00f7c..fa35ff79ab3 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -231,7 +231,7 @@ struct jffs2_tmp_dnode_info uint32_t version; uint32_t data_crc; uint32_t partial_crc; - uint16_t csize; + uint32_t csize; uint16_t overlapped; }; diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 03310721712..b6bd4affd9a 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -179,6 +179,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, spin_unlock(&c->erase_completion_lock); schedule(); + remove_wait_queue(&c->erase_wait, &wait); } else spin_unlock(&c->erase_completion_lock); } else if (ret) @@ -211,20 +212,25 @@ out: int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *len, uint32_t sumsize) { - int ret = -EAGAIN; + int ret; minsize = PAD(minsize); jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize); - spin_lock(&c->erase_completion_lock); - while(ret == -EAGAIN) { + while (true) { + spin_lock(&c->erase_completion_lock); ret = jffs2_do_reserve_space(c, minsize, len, sumsize); if (ret) { jffs2_dbg(1, "%s(): looping, ret is %d\n", __func__, ret); } + spin_unlock(&c->erase_completion_lock); + + if (ret == -EAGAIN) + cond_resched(); + else + break; } - spin_unlock(&c->erase_completion_lock); if (!ret) ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index ae81b01e6fd..386303dca38 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -543,33 +543,13 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c, static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) { - struct rb_node *this; - struct jffs2_tmp_dnode_info *tn; - - this = list->rb_node; + struct jffs2_tmp_dnode_info *tn, *next; - /* Now at bottom of tree */ - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - tn = rb_entry(this, struct jffs2_tmp_dnode_info, rb); + rbtree_postorder_for_each_entry_safe(tn, next, list, rb) { jffs2_free_full_dnode(tn->fn); jffs2_free_tmp_dnode_info(tn); - - this = rb_parent(this); - if (!this) - break; - - if (this->rb_left == &tn->rb) - this->rb_left = NULL; - else if (this->rb_right == &tn->rb) - this->rb_right = NULL; - else BUG(); - } } + *list = RB_ROOT; } diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 0defb1cc2a3..0918f0e2e26 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -243,6 +243,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); int err; + sync_filesystem(sb); err = jffs2_parse_options(c, data); if (err) return -EINVAL; diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 6e563332bb2..c7c77b0dfcc 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -22,7 +22,6 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 3034e970eb9..ad0f2e2a170 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -22,6 +22,7 @@ #include <linux/crc32.h> #include <linux/jffs2.h> #include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> #include <linux/mtd/mtd.h> #include "nodelist.h" /* -------- xdatum related functions ---------------- @@ -921,8 +922,8 @@ const struct xattr_handler *jffs2_xattr_handlers[] = { &jffs2_security_xattr_handler, #endif #ifdef CONFIG_JFFS2_FS_POSIX_ACL - &jffs2_acl_access_xattr_handler, - &jffs2_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &jffs2_trusted_xattr_handler, NULL @@ -942,10 +943,10 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) { #endif #ifdef CONFIG_JFFS2_FS_POSIX_ACL case JFFS2_XPREFIX_ACL_ACCESS: - ret = &jffs2_acl_access_xattr_handler; + ret = &posix_acl_access_xattr_handler; break; case JFFS2_XPREFIX_ACL_DEFAULT: - ret = &jffs2_acl_default_xattr_handler; + ret = &posix_acl_default_xattr_handler; break; #endif case JFFS2_XPREFIX_TRUSTED: diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index d254d6d3599..0c8ca830b11 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -72,7 +72,7 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type) return acl; } -static int jfs_set_acl(tid_t tid, struct inode *inode, int type, +static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, struct posix_acl *acl) { char *ea_name; @@ -80,21 +80,26 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type, int size = 0; char *value = NULL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - switch(type) { - case ACL_TYPE_ACCESS: - ea_name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - ea_name = POSIX_ACL_XATTR_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? -EACCES : 0; - break; - default: - return -EINVAL; + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + rc = posix_acl_equiv_mode(acl, &inode->i_mode); + if (rc < 0) + return rc; + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + if (rc == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + ea_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EINVAL; } + if (acl) { size = posix_acl_xattr_size(acl->a_count); value = kmalloc(size, GFP_KERNEL); @@ -114,65 +119,43 @@ out: return rc; } +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int rc; + tid_t tid; + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&JFS_IP(inode)->commit_mutex); + rc = __jfs_set_acl(tid, inode, type, acl); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + return rc; +} + int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; int rc = 0; - if (S_ISLNK(inode->i_mode)) - return 0; + rc = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (rc) + return rc; - acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); + if (default_acl) { + rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl); + posix_acl_release(default_acl); + } if (acl) { - if (S_ISDIR(inode->i_mode)) { - rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); - if (rc) - goto cleanup; - } - rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (rc < 0) - goto cleanup; /* posix_acl_release(NULL) is no-op */ - if (rc > 0) - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); -cleanup: + if (!rc) + rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - } else - inode->i_mode &= ~current_umask(); + } JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | inode->i_mode; return rc; } - -int jfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int rc; - tid_t tid; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (rc) - return rc; - - tid = txBegin(inode->i_sb, 0); - mutex_lock(&JFS_IP(inode)->commit_mutex); - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); - if (!rc) - rc = txCommit(tid, 1, &inode, 0); - txEnd(tid); - mutex_unlock(&JFS_IP(inode)->commit_mutex); - - posix_acl_release(acl); - return rc; -} diff --git a/fs/jfs/file.c b/fs/jfs/file.c index dd7442c5835..33aa0cc1f8b 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/fs.h> +#include <linux/posix_acl.h> #include <linux/quotaops.h> #include "jfs_incore.h" #include "jfs_inode.h" @@ -131,7 +132,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = jfs_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); return rc; } @@ -143,19 +144,20 @@ const struct inode_operations jfs_file_inode_operations = { .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, #endif }; const struct file_operations jfs_file_operations = { .open = jfs_open, .llseek = generic_file_llseek, - .write = do_sync_write, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .fsync = jfs_fsync, .release = jfs_release, .unlocked_ioctl = jfs_ioctl, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index f4aab719add..bd3df1ca3c9 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -154,7 +154,7 @@ void jfs_evict_inode(struct inode *inode) dquot_initialize(inode); if (JFS_IP(inode)->fileset == FILESYSTEM_I) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (test_cflag(COMMIT_Freewmap, inode)) jfs_free_zero_link(inode); @@ -168,7 +168,7 @@ void jfs_evict_inode(struct inode *inode) dquot_free_inode(inode); } } else { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } clear_inode(inode); dquot_drop(inode); @@ -331,15 +331,15 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block) } static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - jfs_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, jfs_get_block); /* * In case of error extending write may have instantiated a few @@ -347,7 +347,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) jfs_write_failed(mapping, end); diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index ad84fe50ca9..489f993b7b1 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -21,8 +21,8 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type); +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); -int jfs_acl_chmod(struct inode *inode); #else @@ -32,10 +32,5 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode, return 0; } -static inline int jfs_acl_chmod(struct inode *inode) -{ - return 0; -} - #endif #endif /* _H_JFS_ACL */ diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 370d7b6c594..2d514c7affc 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -1208,7 +1208,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, * by this leaf. */ l2size = - min((int)leaf[word], NLSTOL2BSZ(nwords)); + min_t(int, leaf[word], NLSTOL2BSZ(nwords)); /* determine how many words were handled. */ @@ -1902,7 +1902,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) /* determine how many blocks to allocate from this dmap. */ - nb = min(n, (s64)BPERDMAP); + nb = min_t(s64, n, BPERDMAP); /* allocate the blocks from the dmap. */ @@ -2260,7 +2260,8 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * of bits being allocated and the l2 number * of bits currently described by this leaf. */ - size = min((int)leaf[word], NLSTOL2BSZ(nwords)); + size = min_t(int, leaf[word], + NLSTOL2BSZ(nwords)); /* update the leaf to reflect the allocation. * in addition to setting the leaf value to @@ -3563,7 +3564,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) if (mp == NULL) goto errout; - n = min(nblocks, (s64)BPERDMAP); + n = min_t(s64, nblocks, BPERDMAP); } dp = (struct dmap *) mp->data; diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index c1a3e603279..6b0f816201a 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -29,20 +29,20 @@ void jfs_set_inode_flags(struct inode *inode) { unsigned int flags = JFS_IP(inode)->mode2; - - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | - S_NOATIME | S_DIRSYNC | S_SYNC); + unsigned int new_fl = 0; if (flags & JFS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & JFS_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & JFS_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & JFS_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; if (flags & JFS_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; + inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND | S_NOATIME | + S_DIRSYNC | S_SYNC); } void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) @@ -95,7 +95,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) if (insert_inode_locked(inode) < 0) { rc = -EINVAL; - goto fail_unlock; + goto fail_put; } inode_init_owner(inode, parent, mode); @@ -156,7 +156,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode) fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; -fail_unlock: clear_nlink(inode); unlock_new_inode(inode); fail_put: diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 360d27c4888..0acddf60af5 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -167,7 +167,7 @@ do { \ * Global list of active external journals */ static LIST_HEAD(jfs_external_logs); -static struct jfs_log *dummy_log = NULL; +static struct jfs_log *dummy_log; static DEFINE_MUTEX(jfs_log_mutex); /* @@ -1998,20 +1998,20 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = log->bdev; bio->bi_io_vec[0].bv_page = bp->l_page; bio->bi_io_vec[0].bv_len = LOGPSIZE; bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_size = LOGPSIZE; + bio->bi_iter.bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; bio->bi_private = bp; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; lbmIODone(bio, 0); } else { submit_bio(READ_SYNC, bio); @@ -2144,21 +2144,21 @@ static void lbmStartIO(struct lbuf * bp) jfs_info("lbmStartIO\n"); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = log->bdev; bio->bi_io_vec[0].bv_page = bp->l_page; bio->bi_io_vec[0].bv_len = LOGPSIZE; bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_size = LOGPSIZE; + bio->bi_iter.bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; bio->bi_private = bp; /* check if journaling to disk has been disabled */ if (log->no_integrity) { - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; lbmIODone(bio, 0); } else { submit_bio(WRITE_SYNC, bio); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index d165cde0c68..49ba7ff1bbb 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -416,7 +416,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) * count from hitting zero before we're through */ inc_io(page); - if (!bio->bi_size) + if (!bio->bi_iter.bi_size) goto dump_bio; submit_bio(WRITE, bio); nr_underway++; @@ -438,7 +438,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; @@ -452,7 +452,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) if (bio) { if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes) goto add_failed; - if (!bio->bi_size) + if (!bio->bi_iter.bi_size) goto dump_bio; submit_bio(WRITE, bio); @@ -517,7 +517,8 @@ static int metapage_readpage(struct file *fp, struct page *page) bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_iter.bi_sector = + pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; bio->bi_private = page; len = xlen << inode->i_blkbits; diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index e9e100fd7c0..e8d717dabca 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -61,6 +61,8 @@ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); extern int jfs_removexattr(struct dentry *, const char *); +extern const struct xattr_handler *jfs_xattr_handlers[]; + #ifdef CONFIG_JFS_SECURITY extern int jfs_init_security(tid_t, struct inode *, struct inode *, const struct qstr *); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index aa8a3370631..d59c7defb1e 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1524,6 +1524,7 @@ const struct inode_operations jfs_dir_inode_operations = { .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 6669aa2042c..adf8cb045b9 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -44,19 +44,20 @@ #include "jfs_imap.h" #include "jfs_acl.h" #include "jfs_debug.h" +#include "jfs_xattr.h" MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); MODULE_LICENSE("GPL"); -static struct kmem_cache * jfs_inode_cachep; +static struct kmem_cache *jfs_inode_cachep; static const struct super_operations jfs_super_operations; static const struct export_operations jfs_export_operations; static struct file_system_type jfs_fs_type; #define MAX_COMMIT_THREADS 64 -static int commit_threads = 0; +static int commit_threads; module_param(commit_threads, int, 0); MODULE_PARM_DESC(commit_threads, "Number of commit threads"); @@ -83,8 +84,7 @@ static void jfs_handle_error(struct super_block *sb) panic("JFS (device %s): panic forced after error\n", sb->s_id); else if (sbi->flag & JFS_ERR_REMOUNT_RO) { - jfs_err("ERROR: (device %s): remounting filesystem " - "as read-only\n", + jfs_err("ERROR: (device %s): remounting filesystem as read-only\n", sb->s_id); sb->s_flags |= MS_RDONLY; } @@ -272,7 +272,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_resize: { char *resize = args[0].from; - *newLVSize = simple_strtoull(resize, &resize, 0); + int rc = kstrtoll(resize, 0, newLVSize); + + if (rc) + goto cleanup; break; } case Opt_resize_nosize: @@ -326,7 +329,11 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_uid: { char *uid = args[0].from; - uid_t val = simple_strtoul(uid, &uid, 0); + uid_t val; + int rc = kstrtouint(uid, 0, &val); + + if (rc) + goto cleanup; sbi->uid = make_kuid(current_user_ns(), val); if (!uid_valid(sbi->uid)) goto cleanup; @@ -336,7 +343,11 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_gid: { char *gid = args[0].from; - gid_t val = simple_strtoul(gid, &gid, 0); + gid_t val; + int rc = kstrtouint(gid, 0, &val); + + if (rc) + goto cleanup; sbi->gid = make_kgid(current_user_ns(), val); if (!gid_valid(sbi->gid)) goto cleanup; @@ -346,7 +357,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_umask: { char *umask = args[0].from; - sbi->umask = simple_strtoul(umask, &umask, 8); + int rc = kstrtouint(umask, 8, &sbi->umask); + + if (rc) + goto cleanup; if (sbi->umask & ~0777) { pr_err("JFS: Invalid value of umask\n"); goto cleanup; @@ -362,12 +376,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, * -> user has more control over the online trimming */ sbi->minblks_trim = 64; - if (blk_queue_discard(q)) { + if (blk_queue_discard(q)) *flag |= JFS_DISCARD; - } else { - pr_err("JFS: discard option " \ - "not supported on device\n"); - } + else + pr_err("JFS: discard option not supported on device\n"); break; } @@ -379,20 +391,21 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, { struct request_queue *q = bdev_get_queue(sb->s_bdev); char *minblks_trim = args[0].from; + int rc; if (blk_queue_discard(q)) { *flag |= JFS_DISCARD; - sbi->minblks_trim = simple_strtoull( - minblks_trim, &minblks_trim, 0); - } else { - pr_err("JFS: discard option " \ - "not supported on device\n"); - } + rc = kstrtouint(minblks_trim, 0, + &sbi->minblks_trim); + if (rc) + goto cleanup; + } else + pr_err("JFS: discard option not supported on device\n"); break; } default: - printk("jfs: Unrecognized mount option \"%s\" " - " or missing value\n", p); + printk("jfs: Unrecognized mount option \"%s\" or missing value\n", + p); goto cleanup; } } @@ -417,14 +430,13 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) int flag = JFS_SBI(sb)->flag; int ret; - if (!parse_options(data, sb, &newLVSize, &flag)) { + sync_filesystem(sb); + if (!parse_options(data, sb, &newLVSize, &flag)) return -EINVAL; - } if (newLVSize) { if (sb->s_flags & MS_RDONLY) { - pr_err("JFS: resize requires volume" \ - " to be mounted read-write\n"); + pr_err("JFS: resize requires volume to be mounted read-write\n"); return -EROFS; } rc = jfs_extendfs(sb, newLVSize, 0); @@ -450,9 +462,8 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) } if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { rc = dquot_suspend(sb, -1); - if (rc < 0) { + if (rc < 0) return rc; - } rc = jfs_umount_rw(sb); JFS_SBI(sb)->flag = flag; return rc; @@ -485,7 +496,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) if (!new_valid_dev(sb->s_bdev->bd_dev)) return -EOVERFLOW; - sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; @@ -522,6 +533,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) */ sb->s_op = &jfs_super_operations; sb->s_export_op = &jfs_export_operations; + sb->s_xattr = jfs_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; sb->s_qcop = &dquot_quotactl_ops; @@ -545,9 +557,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) rc = jfs_mount(sb); if (rc) { - if (!silent) { + if (!silent) jfs_err("jfs_mount failed w/return code = %d", rc); - } goto out_mount_failed; } if (sb->s_flags & MS_RDONLY) @@ -584,7 +595,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) * Page cache is indexed by long. * I would use MAX_LFS_FILESIZE, but it's only half as big */ - sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes); + sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, + (u64)sb->s_maxbytes); #endif sb->s_time_gran = 1; return 0; @@ -594,9 +606,8 @@ out_no_root: out_no_rw: rc = jfs_umount(sb); - if (rc) { + if (rc) jfs_err("jfs_umount failed with return code %d", rc); - } out_mount_failed: filemap_write_and_wait(sbi->direct_inode->i_mapping); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); @@ -921,7 +932,8 @@ static int __init init_jfs_fs(void) commit_threads = MAX_COMMIT_THREADS; for (i = 0; i < commit_threads; i++) { - jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, "jfsCommit"); + jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, + "jfsCommit"); if (IS_ERR(jfsCommitThread[i])) { rc = PTR_ERR(jfsCommitThread[i]); jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index d3472f4cd53..46325d5c34f 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -666,81 +666,12 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, } /* - * can_set_system_xattr - * - * This code is specific to the system.* namespace. It contains policy - * which doesn't belong in the main xattr codepath. - */ -static int can_set_system_xattr(struct inode *inode, const char *name, - const void *value, size_t value_len) -{ -#ifdef CONFIG_JFS_POSIX_ACL - struct posix_acl *acl; - int rc; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - /* - * POSIX_ACL_XATTR_ACCESS is tied to i_mode - */ - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, value_len); - if (IS_ERR(acl)) { - rc = PTR_ERR(acl); - printk(KERN_ERR "posix_acl_from_xattr returned %d\n", - rc); - return rc; - } - if (acl) { - rc = posix_acl_equiv_mode(acl, &inode->i_mode); - posix_acl_release(acl); - if (rc < 0) { - printk(KERN_ERR - "posix_acl_equiv_mode returned %d\n", - rc); - return rc; - } - mark_inode_dirty(inode); - } - /* - * We're changing the ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_ACCESS); - - return 0; - } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, value_len); - if (IS_ERR(acl)) { - rc = PTR_ERR(acl); - printk(KERN_ERR "posix_acl_from_xattr returned %d\n", - rc); - return rc; - } - posix_acl_release(acl); - - /* - * We're changing the default ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_DEFAULT); - - return 0; - } -#endif /* CONFIG_JFS_POSIX_ACL */ - return -EOPNOTSUPP; -} - -/* * Most of the permission checking is done by xattr_permission in the vfs. - * The local file system is responsible for handling the system.* namespace. * We also need to verify that this is a namespace that we recognize. */ static int can_set_xattr(struct inode *inode, const char *name, const void *value, size_t value_len) { - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return can_set_system_xattr(inode, name, value, value_len); - if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { /* * This makes sure that we aren't trying to set an @@ -748,7 +679,7 @@ static int can_set_xattr(struct inode *inode, const char *name, * with "os2." */ if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) - return -EOPNOTSUPP; + return -EOPNOTSUPP; return 0; } @@ -860,6 +791,19 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, /* Completely new ea list */ xattr_size = sizeof (struct jfs_ea_list); + /* + * The size of EA value is limitted by on-disk format up to + * __le16, there would be an overflow if the size is equal + * to XATTR_SIZE_MAX (65536). In order to avoid this issue, + * we can pre-checkup the value size against USHRT_MAX, and + * return -E2BIG in this case, which is consistent with the + * VFS setxattr interface. + */ + if (value_len >= USHRT_MAX) { + rc = -E2BIG; + goto release; + } + ea = (struct jfs_ea *) ((char *) ealist + xattr_size); ea->flag = 0; ea->namelen = namelen; @@ -874,7 +818,7 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, /* DEBUG - If we did this right, these number match */ if (xattr_size != new_size) { printk(KERN_ERR - "jfs_xsetattr: xattr_size = %d, new_size = %d\n", + "__jfs_setxattr: xattr_size = %d, new_size = %d\n", xattr_size, new_size); rc = -EINVAL; @@ -910,6 +854,14 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, int rc; tid_t tid; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, value_len, flags); + if ((rc = can_set_xattr(inode, name, value, value_len))) return rc; @@ -986,6 +938,14 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, { int err; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, data, buf_size); + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { /* * skip past "os2." prefix @@ -1074,6 +1034,14 @@ int jfs_removexattr(struct dentry *dentry, const char *name) int rc; tid_t tid; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + if ((rc = can_set_xattr(inode, name, NULL, 0))) return rc; @@ -1088,6 +1056,19 @@ int jfs_removexattr(struct dentry *dentry, const char *name) return rc; } +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. + */ +const struct xattr_handler *jfs_xattr_handlers[] = { +#ifdef CONFIG_JFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + + #ifdef CONFIG_JFS_SECURITY static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) diff --git a/fs/kernfs/Kconfig b/fs/kernfs/Kconfig new file mode 100644 index 00000000000..397b5f7a7a1 --- /dev/null +++ b/fs/kernfs/Kconfig @@ -0,0 +1,7 @@ +# +# KERNFS should be selected by its users +# + +config KERNFS + bool + default n diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile new file mode 100644 index 00000000000..674337c7667 --- /dev/null +++ b/fs/kernfs/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the kernfs pseudo filesystem +# + +obj-y := mount.o inode.o dir.o file.o symlink.o diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c new file mode 100644 index 00000000000..a693f5b01ae --- /dev/null +++ b/fs/kernfs/dir.c @@ -0,0 +1,1432 @@ +/* + * fs/kernfs/dir.c - kernfs directory implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/idr.h> +#include <linux/slab.h> +#include <linux/security.h> +#include <linux/hash.h> + +#include "kernfs-internal.h" + +DEFINE_MUTEX(kernfs_mutex); +static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ +static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ + +#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) + +static bool kernfs_active(struct kernfs_node *kn) +{ + lockdep_assert_held(&kernfs_mutex); + return atomic_read(&kn->active) >= 0; +} + +static bool kernfs_lockdep(struct kernfs_node *kn) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + return kn->flags & KERNFS_LOCKDEP; +#else + return false; +#endif +} + +static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) +{ + return strlcpy(buf, kn->parent ? kn->name : "/", buflen); +} + +static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf, + size_t buflen) +{ + char *p = buf + buflen; + int len; + + *--p = '\0'; + + do { + len = strlen(kn->name); + if (p - buf < len + 1) { + buf[0] = '\0'; + p = NULL; + break; + } + p -= len; + memcpy(p, kn->name, len); + *--p = '/'; + kn = kn->parent; + } while (kn && kn->parent); + + return p; +} + +/** + * kernfs_name - obtain the name of a given node + * @kn: kernfs_node of interest + * @buf: buffer to copy @kn's name into + * @buflen: size of @buf + * + * Copies the name of @kn into @buf of @buflen bytes. The behavior is + * similar to strlcpy(). It returns the length of @kn's name and if @buf + * isn't long enough, it's filled upto @buflen-1 and nul terminated. + * + * This function can be called from any context. + */ +int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + ret = kernfs_name_locked(kn, buf, buflen); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + return ret; +} + +/** + * kernfs_path - build full path of a given node + * @kn: kernfs_node of interest + * @buf: buffer to copy @kn's name into + * @buflen: size of @buf + * + * Builds and returns the full path of @kn in @buf of @buflen bytes. The + * path is built from the end of @buf so the returned pointer usually + * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated + * and %NULL is returned. + */ +char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) +{ + unsigned long flags; + char *p; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + p = kernfs_path_locked(kn, buf, buflen); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + return p; +} +EXPORT_SYMBOL_GPL(kernfs_path); + +/** + * pr_cont_kernfs_name - pr_cont name of a kernfs_node + * @kn: kernfs_node of interest + * + * This function can be called from any context. + */ +void pr_cont_kernfs_name(struct kernfs_node *kn) +{ + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); + pr_cont("%s", kernfs_pr_cont_buf); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); +} + +/** + * pr_cont_kernfs_path - pr_cont path of a kernfs_node + * @kn: kernfs_node of interest + * + * This function can be called from any context. + */ +void pr_cont_kernfs_path(struct kernfs_node *kn) +{ + unsigned long flags; + char *p; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + p = kernfs_path_locked(kn, kernfs_pr_cont_buf, + sizeof(kernfs_pr_cont_buf)); + if (p) + pr_cont("%s", p); + else + pr_cont("<name too long>"); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); +} + +/** + * kernfs_get_parent - determine the parent node and pin it + * @kn: kernfs_node of interest + * + * Determines @kn's parent, pins and returns it. This function can be + * called from any context. + */ +struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + parent = kn->parent; + kernfs_get(parent); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + + return parent; +} + +/** + * kernfs_name_hash + * @name: Null terminated string to hash + * @ns: Namespace tag to hash + * + * Returns 31 bit hash of ns + name (so it fits in an off_t ) + */ +static unsigned int kernfs_name_hash(const char *name, const void *ns) +{ + unsigned long hash = init_name_hash(); + unsigned int len = strlen(name); + while (len--) + hash = partial_name_hash(*name++, hash); + hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); + hash &= 0x7fffffffU; + /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ + if (hash < 2) + hash += 2; + if (hash >= INT_MAX) + hash = INT_MAX - 1; + return hash; +} + +static int kernfs_name_compare(unsigned int hash, const char *name, + const void *ns, const struct kernfs_node *kn) +{ + if (hash != kn->hash) + return hash - kn->hash; + if (ns != kn->ns) + return ns - kn->ns; + return strcmp(name, kn->name); +} + +static int kernfs_sd_compare(const struct kernfs_node *left, + const struct kernfs_node *right) +{ + return kernfs_name_compare(left->hash, left->name, left->ns, right); +} + +/** + * kernfs_link_sibling - link kernfs_node into sibling rbtree + * @kn: kernfs_node of interest + * + * Link @kn into its sibling rbtree which starts from + * @kn->parent->dir.children. + * + * Locking: + * mutex_lock(kernfs_mutex) + * + * RETURNS: + * 0 on susccess -EEXIST on failure. + */ +static int kernfs_link_sibling(struct kernfs_node *kn) +{ + struct rb_node **node = &kn->parent->dir.children.rb_node; + struct rb_node *parent = NULL; + + while (*node) { + struct kernfs_node *pos; + int result; + + pos = rb_to_kn(*node); + parent = *node; + result = kernfs_sd_compare(kn, pos); + if (result < 0) + node = &pos->rb.rb_left; + else if (result > 0) + node = &pos->rb.rb_right; + else + return -EEXIST; + } + + /* add new node and rebalance the tree */ + rb_link_node(&kn->rb, parent, node); + rb_insert_color(&kn->rb, &kn->parent->dir.children); + + /* successfully added, account subdir number */ + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs++; + + return 0; +} + +/** + * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree + * @kn: kernfs_node of interest + * + * Try to unlink @kn from its sibling rbtree which starts from + * kn->parent->dir.children. Returns %true if @kn was actually + * removed, %false if @kn wasn't on the rbtree. + * + * Locking: + * mutex_lock(kernfs_mutex) + */ +static bool kernfs_unlink_sibling(struct kernfs_node *kn) +{ + if (RB_EMPTY_NODE(&kn->rb)) + return false; + + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs--; + + rb_erase(&kn->rb, &kn->parent->dir.children); + RB_CLEAR_NODE(&kn->rb); + return true; +} + +/** + * kernfs_get_active - get an active reference to kernfs_node + * @kn: kernfs_node to get an active reference to + * + * Get an active reference of @kn. This function is noop if @kn + * is NULL. + * + * RETURNS: + * Pointer to @kn on success, NULL on failure. + */ +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) +{ + if (unlikely(!kn)) + return NULL; + + if (!atomic_inc_unless_negative(&kn->active)) + return NULL; + + if (kernfs_lockdep(kn)) + rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); + return kn; +} + +/** + * kernfs_put_active - put an active reference to kernfs_node + * @kn: kernfs_node to put an active reference to + * + * Put an active reference to @kn. This function is noop if @kn + * is NULL. + */ +void kernfs_put_active(struct kernfs_node *kn) +{ + struct kernfs_root *root = kernfs_root(kn); + int v; + + if (unlikely(!kn)) + return; + + if (kernfs_lockdep(kn)) + rwsem_release(&kn->dep_map, 1, _RET_IP_); + v = atomic_dec_return(&kn->active); + if (likely(v != KN_DEACTIVATED_BIAS)) + return; + + wake_up_all(&root->deactivate_waitq); +} + +/** + * kernfs_drain - drain kernfs_node + * @kn: kernfs_node to drain + * + * Drain existing usages and nuke all existing mmaps of @kn. Mutiple + * removers may invoke this function concurrently on @kn and all will + * return after draining is complete. + */ +static void kernfs_drain(struct kernfs_node *kn) + __releases(&kernfs_mutex) __acquires(&kernfs_mutex) +{ + struct kernfs_root *root = kernfs_root(kn); + + lockdep_assert_held(&kernfs_mutex); + WARN_ON_ONCE(kernfs_active(kn)); + + mutex_unlock(&kernfs_mutex); + + if (kernfs_lockdep(kn)) { + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) + lock_contended(&kn->dep_map, _RET_IP_); + } + + /* but everyone should wait for draining */ + wait_event(root->deactivate_waitq, + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); + + if (kernfs_lockdep(kn)) { + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, 1, _RET_IP_); + } + + kernfs_unmap_bin_file(kn); + + mutex_lock(&kernfs_mutex); +} + +/** + * kernfs_get - get a reference count on a kernfs_node + * @kn: the target kernfs_node + */ +void kernfs_get(struct kernfs_node *kn) +{ + if (kn) { + WARN_ON(!atomic_read(&kn->count)); + atomic_inc(&kn->count); + } +} +EXPORT_SYMBOL_GPL(kernfs_get); + +/** + * kernfs_put - put a reference count on a kernfs_node + * @kn: the target kernfs_node + * + * Put a reference count of @kn and destroy it if it reached zero. + */ +void kernfs_put(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + struct kernfs_root *root; + + if (!kn || !atomic_dec_and_test(&kn->count)) + return; + root = kernfs_root(kn); + repeat: + /* + * Moving/renaming is always done while holding reference. + * kn->parent won't change beneath us. + */ + parent = kn->parent; + + WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, + "kernfs_put: %s/%s: released with incorrect active_ref %d\n", + parent ? parent->name : "", kn->name, atomic_read(&kn->active)); + + if (kernfs_type(kn) == KERNFS_LINK) + kernfs_put(kn->symlink.target_kn); + if (!(kn->flags & KERNFS_STATIC_NAME)) + kfree(kn->name); + if (kn->iattr) { + if (kn->iattr->ia_secdata) + security_release_secctx(kn->iattr->ia_secdata, + kn->iattr->ia_secdata_len); + simple_xattrs_free(&kn->iattr->xattrs); + } + kfree(kn->iattr); + ida_simple_remove(&root->ino_ida, kn->ino); + kmem_cache_free(kernfs_node_cache, kn); + + kn = parent; + if (kn) { + if (atomic_dec_and_test(&kn->count)) + goto repeat; + } else { + /* just released the root kn, free @root too */ + ida_destroy(&root->ino_ida); + kfree(root); + } +} +EXPORT_SYMBOL_GPL(kernfs_put); + +static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct kernfs_node *kn; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* Always perform fresh lookup for negatives */ + if (!dentry->d_inode) + goto out_bad_unlocked; + + kn = dentry->d_fsdata; + mutex_lock(&kernfs_mutex); + + /* The kernfs node has been deactivated */ + if (!kernfs_active(kn)) + goto out_bad; + + /* The kernfs node has been moved? */ + if (dentry->d_parent->d_fsdata != kn->parent) + goto out_bad; + + /* The kernfs node has been renamed */ + if (strcmp(dentry->d_name.name, kn->name) != 0) + goto out_bad; + + /* The kernfs node has been moved to a different namespace */ + if (kn->parent && kernfs_ns_enabled(kn->parent) && + kernfs_info(dentry->d_sb)->ns != kn->ns) + goto out_bad; + + mutex_unlock(&kernfs_mutex); +out_valid: + return 1; +out_bad: + mutex_unlock(&kernfs_mutex); +out_bad_unlocked: + /* + * @dentry doesn't match the underlying kernfs node, drop the + * dentry and force lookup. If we have submounts we must allow the + * vfs caches to lie about the state of the filesystem to prevent + * leaks and other nasty things, so use check_submounts_and_drop() + * instead of d_drop(). + */ + if (check_submounts_and_drop(dentry) != 0) + goto out_valid; + + return 0; +} + +static void kernfs_dop_release(struct dentry *dentry) +{ + kernfs_put(dentry->d_fsdata); +} + +const struct dentry_operations kernfs_dops = { + .d_revalidate = kernfs_dop_revalidate, + .d_release = kernfs_dop_release, +}; + +/** + * kernfs_node_from_dentry - determine kernfs_node associated with a dentry + * @dentry: the dentry in question + * + * Return the kernfs_node associated with @dentry. If @dentry is not a + * kernfs one, %NULL is returned. + * + * While the returned kernfs_node will stay accessible as long as @dentry + * is accessible, the returned node can be in any state and the caller is + * fully responsible for determining what's accessible. + */ +struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) +{ + if (dentry->d_sb->s_op == &kernfs_sops) + return dentry->d_fsdata; + return NULL; +} + +static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, + const char *name, umode_t mode, + unsigned flags) +{ + char *dup_name = NULL; + struct kernfs_node *kn; + int ret; + + if (!(flags & KERNFS_STATIC_NAME)) { + name = dup_name = kstrdup(name, GFP_KERNEL); + if (!name) + return NULL; + } + + kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); + if (!kn) + goto err_out1; + + ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + goto err_out2; + kn->ino = ret; + + atomic_set(&kn->count, 1); + atomic_set(&kn->active, KN_DEACTIVATED_BIAS); + RB_CLEAR_NODE(&kn->rb); + + kn->name = name; + kn->mode = mode; + kn->flags = flags; + + return kn; + + err_out2: + kmem_cache_free(kernfs_node_cache, kn); + err_out1: + kfree(dup_name); + return NULL; +} + +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags) +{ + struct kernfs_node *kn; + + kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags); + if (kn) { + kernfs_get(parent); + kn->parent = parent; + } + return kn; +} + +/** + * kernfs_add_one - add kernfs_node to parent without warning + * @kn: kernfs_node to be added + * + * The caller must already have initialized @kn->parent. This + * function increments nlink of the parent's inode if @kn is a + * directory and link into the children list of the parent. + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int kernfs_add_one(struct kernfs_node *kn) +{ + struct kernfs_node *parent = kn->parent; + struct kernfs_iattrs *ps_iattr; + bool has_ns; + int ret; + + mutex_lock(&kernfs_mutex); + + ret = -EINVAL; + has_ns = kernfs_ns_enabled(parent); + if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, kn->name)) + goto out_unlock; + + if (kernfs_type(parent) != KERNFS_DIR) + goto out_unlock; + + ret = -ENOENT; + if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) + goto out_unlock; + + kn->hash = kernfs_name_hash(kn->name, kn->ns); + + ret = kernfs_link_sibling(kn); + if (ret) + goto out_unlock; + + /* Update timestamps on the parent */ + ps_iattr = parent->iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + + mutex_unlock(&kernfs_mutex); + + /* + * Activate the new node unless CREATE_DEACTIVATED is requested. + * If not activated here, the kernfs user is responsible for + * activating the node with kernfs_activate(). A node which hasn't + * been activated is not visible to userland and its removal won't + * trigger deactivation. + */ + if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) + kernfs_activate(kn); + return 0; + +out_unlock: + mutex_unlock(&kernfs_mutex); + return ret; +} + +/** + * kernfs_find_ns - find kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent. Returns pointer to + * the found kernfs_node on success, %NULL on failure. + */ +static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, + const unsigned char *name, + const void *ns) +{ + struct rb_node *node = parent->dir.children.rb_node; + bool has_ns = kernfs_ns_enabled(parent); + unsigned int hash; + + lockdep_assert_held(&kernfs_mutex); + + if (has_ns != (bool)ns) { + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, name); + return NULL; + } + + hash = kernfs_name_hash(name, ns); + while (node) { + struct kernfs_node *kn; + int result; + + kn = rb_to_kn(node); + result = kernfs_name_compare(hash, name, ns, kn); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return kn; + } + return NULL; +} + +/** + * kernfs_find_and_get_ns - find and get kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent and get a reference + * if found. This function may sleep and returns pointer to the found + * kernfs_node on success, %NULL on failure. + */ +struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, + const char *name, const void *ns) +{ + struct kernfs_node *kn; + + mutex_lock(&kernfs_mutex); + kn = kernfs_find_ns(parent, name, ns); + kernfs_get(kn); + mutex_unlock(&kernfs_mutex); + + return kn; +} +EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); + +/** + * kernfs_create_root - create a new kernfs hierarchy + * @scops: optional syscall operations for the hierarchy + * @flags: KERNFS_ROOT_* flags + * @priv: opaque data associated with the new directory + * + * Returns the root of the new hierarchy on success, ERR_PTR() value on + * failure. + */ +struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, + unsigned int flags, void *priv) +{ + struct kernfs_root *root; + struct kernfs_node *kn; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + ida_init(&root->ino_ida); + INIT_LIST_HEAD(&root->supers); + + kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, + KERNFS_DIR); + if (!kn) { + ida_destroy(&root->ino_ida); + kfree(root); + return ERR_PTR(-ENOMEM); + } + + kn->priv = priv; + kn->dir.root = root; + + root->syscall_ops = scops; + root->flags = flags; + root->kn = kn; + init_waitqueue_head(&root->deactivate_waitq); + + if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) + kernfs_activate(kn); + + return root; +} + +/** + * kernfs_destroy_root - destroy a kernfs hierarchy + * @root: root of the hierarchy to destroy + * + * Destroy the hierarchy anchored at @root by removing all existing + * directories and destroying @root. + */ +void kernfs_destroy_root(struct kernfs_root *root) +{ + kernfs_remove(root->kn); /* will also free @root */ +} + +/** + * kernfs_create_dir_ns - create a directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * @mode: mode of the new directory + * @priv: opaque data associated with the new directory + * @ns: optional namespace tag of the directory + * + * Returns the created node on success, ERR_PTR() value on failure. + */ +struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, + const char *name, umode_t mode, + void *priv, const void *ns) +{ + struct kernfs_node *kn; + int rc; + + /* allocate */ + kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->dir.root = parent->dir.root; + kn->ns = ns; + kn->priv = priv; + + /* link in */ + rc = kernfs_add_one(kn); + if (!rc) + return kn; + + kernfs_put(kn); + return ERR_PTR(rc); +} + +static struct dentry *kernfs_iop_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct dentry *ret; + struct kernfs_node *parent = dentry->d_parent->d_fsdata; + struct kernfs_node *kn; + struct inode *inode; + const void *ns = NULL; + + mutex_lock(&kernfs_mutex); + + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dir->i_sb)->ns; + + kn = kernfs_find_ns(parent, dentry->d_name.name, ns); + + /* no such entry */ + if (!kn || !kernfs_active(kn)) { + ret = NULL; + goto out_unlock; + } + kernfs_get(kn); + dentry->d_fsdata = kn; + + /* attach dentry and inode */ + inode = kernfs_get_inode(dir->i_sb, kn); + if (!inode) { + ret = ERR_PTR(-ENOMEM); + goto out_unlock; + } + + /* instantiate and hash dentry */ + ret = d_materialise_unique(dentry, inode); + out_unlock: + mutex_unlock(&kernfs_mutex); + return ret; +} + +static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + struct kernfs_node *parent = dir->i_private; + struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; + int ret; + + if (!scops || !scops->mkdir) + return -EPERM; + + if (!kernfs_get_active(parent)) + return -ENODEV; + + ret = scops->mkdir(parent, dentry->d_name.name, mode); + + kernfs_put_active(parent); + return ret; +} + +static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; + int ret; + + if (!scops || !scops->rmdir) + return -EPERM; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + ret = scops->rmdir(kn); + + kernfs_put_active(kn); + return ret; +} + +static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct kernfs_node *kn = old_dentry->d_fsdata; + struct kernfs_node *new_parent = new_dir->i_private; + struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; + int ret; + + if (!scops || !scops->rename) + return -EPERM; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + if (!kernfs_get_active(new_parent)) { + kernfs_put_active(kn); + return -ENODEV; + } + + ret = scops->rename(kn, new_parent, new_dentry->d_name.name); + + kernfs_put_active(new_parent); + kernfs_put_active(kn); + return ret; +} + +const struct inode_operations kernfs_dir_iops = { + .lookup = kernfs_iop_lookup, + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, + + .mkdir = kernfs_iop_mkdir, + .rmdir = kernfs_iop_rmdir, + .rename = kernfs_iop_rename, +}; + +static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) +{ + struct kernfs_node *last; + + while (true) { + struct rb_node *rbn; + + last = pos; + + if (kernfs_type(pos) != KERNFS_DIR) + break; + + rbn = rb_first(&pos->dir.children); + if (!rbn) + break; + + pos = rb_to_kn(rbn); + } + + return last; +} + +/** + * kernfs_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: kernfs_node whose descendants to walk + * + * Find the next descendant to visit for post-order traversal of @root's + * descendants. @root is included in the iteration and the last node to be + * visited. + */ +static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, + struct kernfs_node *root) +{ + struct rb_node *rbn; + + lockdep_assert_held(&kernfs_mutex); + + /* if first iteration, visit leftmost descendant which may be root */ + if (!pos) + return kernfs_leftmost_descendant(root); + + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + + /* if there's an unvisited sibling, visit its leftmost descendant */ + rbn = rb_next(&pos->rb); + if (rbn) + return kernfs_leftmost_descendant(rb_to_kn(rbn)); + + /* no sibling left, visit parent */ + return pos->parent; +} + +/** + * kernfs_activate - activate a node which started deactivated + * @kn: kernfs_node whose subtree is to be activated + * + * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node + * needs to be explicitly activated. A node which hasn't been activated + * isn't visible to userland and deactivation is skipped during its + * removal. This is useful to construct atomic init sequences where + * creation of multiple nodes should either succeed or fail atomically. + * + * The caller is responsible for ensuring that this function is not called + * after kernfs_remove*() is invoked on @kn. + */ +void kernfs_activate(struct kernfs_node *kn) +{ + struct kernfs_node *pos; + + mutex_lock(&kernfs_mutex); + + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) { + if (!pos || (pos->flags & KERNFS_ACTIVATED)) + continue; + + WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); + WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS); + + atomic_sub(KN_DEACTIVATED_BIAS, &pos->active); + pos->flags |= KERNFS_ACTIVATED; + } + + mutex_unlock(&kernfs_mutex); +} + +static void __kernfs_remove(struct kernfs_node *kn) +{ + struct kernfs_node *pos; + + lockdep_assert_held(&kernfs_mutex); + + /* + * Short-circuit if non-root @kn has already finished removal. + * This is for kernfs_remove_self() which plays with active ref + * after removal. + */ + if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb))) + return; + + pr_debug("kernfs %s: removing\n", kn->name); + + /* prevent any new usage under @kn by deactivating all nodes */ + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) + if (kernfs_active(pos)) + atomic_add(KN_DEACTIVATED_BIAS, &pos->active); + + /* deactivate and unlink the subtree node-by-node */ + do { + pos = kernfs_leftmost_descendant(kn); + + /* + * kernfs_drain() drops kernfs_mutex temporarily and @pos's + * base ref could have been put by someone else by the time + * the function returns. Make sure it doesn't go away + * underneath us. + */ + kernfs_get(pos); + + /* + * Drain iff @kn was activated. This avoids draining and + * its lockdep annotations for nodes which have never been + * activated and allows embedding kernfs_remove() in create + * error paths without worrying about draining. + */ + if (kn->flags & KERNFS_ACTIVATED) + kernfs_drain(pos); + else + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + /* + * kernfs_unlink_sibling() succeeds once per node. Use it + * to decide who's responsible for cleanups. + */ + if (!pos->parent || kernfs_unlink_sibling(pos)) { + struct kernfs_iattrs *ps_iattr = + pos->parent ? pos->parent->iattr : NULL; + + /* update timestamps on the parent */ + if (ps_iattr) { + ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; + ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + } + + kernfs_put(pos); + } + + kernfs_put(pos); + } while (pos != kn); +} + +/** + * kernfs_remove - remove a kernfs_node recursively + * @kn: the kernfs_node to remove + * + * Remove @kn along with all its subdirectories and files. + */ +void kernfs_remove(struct kernfs_node *kn) +{ + mutex_lock(&kernfs_mutex); + __kernfs_remove(kn); + mutex_unlock(&kernfs_mutex); +} + +/** + * kernfs_break_active_protection - break out of active protection + * @kn: the self kernfs_node + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. Each invocation of + * this function must also be matched with an invocation of + * kernfs_unbreak_active_protection(). + * + * This function releases the active reference of @kn the caller is + * holding. Once this function is called, @kn may be removed at any point + * and the caller is solely responsible for ensuring that the objects it + * dereferences are accessible. + */ +void kernfs_break_active_protection(struct kernfs_node *kn) +{ + /* + * Take out ourself out of the active ref dependency chain. If + * we're called without an active ref, lockdep will complain. + */ + kernfs_put_active(kn); +} + +/** + * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() + * @kn: the self kernfs_node + * + * If kernfs_break_active_protection() was called, this function must be + * invoked before finishing the kernfs operation. Note that while this + * function restores the active reference, it doesn't and can't actually + * restore the active protection - @kn may already or be in the process of + * being removed. Once kernfs_break_active_protection() is invoked, that + * protection is irreversibly gone for the kernfs operation instance. + * + * While this function may be called at any point after + * kernfs_break_active_protection() is invoked, its most useful location + * would be right before the enclosing kernfs operation returns. + */ +void kernfs_unbreak_active_protection(struct kernfs_node *kn) +{ + /* + * @kn->active could be in any state; however, the increment we do + * here will be undone as soon as the enclosing kernfs operation + * finishes and this temporary bump can't break anything. If @kn + * is alive, nothing changes. If @kn is being deactivated, the + * soon-to-follow put will either finish deactivation or restore + * deactivated state. If @kn is already removed, the temporary + * bump is guaranteed to be gone before @kn is released. + */ + atomic_inc(&kn->active); + if (kernfs_lockdep(kn)) + rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); +} + +/** + * kernfs_remove_self - remove a kernfs_node from its own method + * @kn: the self kernfs_node to remove + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. This can be used to + * implement a file operation which deletes itself. + * + * For example, the "delete" file for a sysfs device directory can be + * implemented by invoking kernfs_remove_self() on the "delete" file + * itself. This function breaks the circular dependency of trying to + * deactivate self while holding an active ref itself. It isn't necessary + * to modify the usual removal path to use kernfs_remove_self(). The + * "delete" implementation can simply invoke kernfs_remove_self() on self + * before proceeding with the usual removal path. kernfs will ignore later + * kernfs_remove() on self. + * + * kernfs_remove_self() can be called multiple times concurrently on the + * same kernfs_node. Only the first one actually performs removal and + * returns %true. All others will wait until the kernfs operation which + * won self-removal finishes and return %false. Note that the losers wait + * for the completion of not only the winning kernfs_remove_self() but also + * the whole kernfs_ops which won the arbitration. This can be used to + * guarantee, for example, all concurrent writes to a "delete" file to + * finish only after the whole operation is complete. + */ +bool kernfs_remove_self(struct kernfs_node *kn) +{ + bool ret; + + mutex_lock(&kernfs_mutex); + kernfs_break_active_protection(kn); + + /* + * SUICIDAL is used to arbitrate among competing invocations. Only + * the first one will actually perform removal. When the removal + * is complete, SUICIDED is set and the active ref is restored + * while holding kernfs_mutex. The ones which lost arbitration + * waits for SUICDED && drained which can happen only after the + * enclosing kernfs operation which executed the winning instance + * of kernfs_remove_self() finished. + */ + if (!(kn->flags & KERNFS_SUICIDAL)) { + kn->flags |= KERNFS_SUICIDAL; + __kernfs_remove(kn); + kn->flags |= KERNFS_SUICIDED; + ret = true; + } else { + wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; + DEFINE_WAIT(wait); + + while (true) { + prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); + + if ((kn->flags & KERNFS_SUICIDED) && + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) + break; + + mutex_unlock(&kernfs_mutex); + schedule(); + mutex_lock(&kernfs_mutex); + } + finish_wait(waitq, &wait); + WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); + ret = false; + } + + /* + * This must be done while holding kernfs_mutex; otherwise, waiting + * for SUICIDED && deactivated could finish prematurely. + */ + kernfs_unbreak_active_protection(kn); + + mutex_unlock(&kernfs_mutex); + return ret; +} + +/** + * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it + * @parent: parent of the target + * @name: name of the kernfs_node to remove + * @ns: namespace tag of the kernfs_node to remove + * + * Look for the kernfs_node with @name and @ns under @parent and remove it. + * Returns 0 on success, -ENOENT if such entry doesn't exist. + */ +int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, + const void *ns) +{ + struct kernfs_node *kn; + + if (!parent) { + WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", + name); + return -ENOENT; + } + + mutex_lock(&kernfs_mutex); + + kn = kernfs_find_ns(parent, name, ns); + if (kn) + __kernfs_remove(kn); + + mutex_unlock(&kernfs_mutex); + + if (kn) + return 0; + else + return -ENOENT; +} + +/** + * kernfs_rename_ns - move and rename a kernfs_node + * @kn: target node + * @new_parent: new parent to put @sd under + * @new_name: new name + * @new_ns: new namespace tag + */ +int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name, const void *new_ns) +{ + struct kernfs_node *old_parent; + const char *old_name = NULL; + int error; + + /* can't move or rename root */ + if (!kn->parent) + return -EINVAL; + + mutex_lock(&kernfs_mutex); + + error = -ENOENT; + if (!kernfs_active(kn) || !kernfs_active(new_parent)) + goto out; + + error = 0; + if ((kn->parent == new_parent) && (kn->ns == new_ns) && + (strcmp(kn->name, new_name) == 0)) + goto out; /* nothing to rename */ + + error = -EEXIST; + if (kernfs_find_ns(new_parent, new_name, new_ns)) + goto out; + + /* rename kernfs_node */ + if (strcmp(kn->name, new_name) != 0) { + error = -ENOMEM; + new_name = kstrdup(new_name, GFP_KERNEL); + if (!new_name) + goto out; + } else { + new_name = NULL; + } + + /* + * Move to the appropriate place in the appropriate directories rbtree. + */ + kernfs_unlink_sibling(kn); + kernfs_get(new_parent); + + /* rename_lock protects ->parent and ->name accessors */ + spin_lock_irq(&kernfs_rename_lock); + + old_parent = kn->parent; + kn->parent = new_parent; + + kn->ns = new_ns; + if (new_name) { + if (!(kn->flags & KERNFS_STATIC_NAME)) + old_name = kn->name; + kn->flags &= ~KERNFS_STATIC_NAME; + kn->name = new_name; + } + + spin_unlock_irq(&kernfs_rename_lock); + + kn->hash = kernfs_name_hash(kn->name, kn->ns); + kernfs_link_sibling(kn); + + kernfs_put(old_parent); + kfree(old_name); + + error = 0; + out: + mutex_unlock(&kernfs_mutex); + return error; +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct kernfs_node *kn) +{ + return (kn->mode >> 12) & 15; +} + +static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) +{ + kernfs_put(filp->private_data); + return 0; +} + +static struct kernfs_node *kernfs_dir_pos(const void *ns, + struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) +{ + if (pos) { + int valid = kernfs_active(pos) && + pos->parent == parent && hash == pos->hash; + kernfs_put(pos); + if (!valid) + pos = NULL; + } + if (!pos && (hash > 1) && (hash < INT_MAX)) { + struct rb_node *node = parent->dir.children.rb_node; + while (node) { + pos = rb_to_kn(node); + + if (hash < pos->hash) + node = node->rb_left; + else if (hash > pos->hash) + node = node->rb_right; + else + break; + } + } + /* Skip over entries which are dying/dead or in the wrong namespace */ + while (pos && (!kernfs_active(pos) || pos->ns != ns)) { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } + return pos; +} + +static struct kernfs_node *kernfs_dir_next_pos(const void *ns, + struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) +{ + pos = kernfs_dir_pos(ns, parent, ino, pos); + if (pos) { + do { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } while (pos && (!kernfs_active(pos) || pos->ns != ns)); + } + return pos; +} + +static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct kernfs_node *parent = dentry->d_fsdata; + struct kernfs_node *pos = file->private_data; + const void *ns = NULL; + + if (!dir_emit_dots(file, ctx)) + return 0; + mutex_lock(&kernfs_mutex); + + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dentry->d_sb)->ns; + + for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); + pos; + pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { + const char *name = pos->name; + unsigned int type = dt_type(pos); + int len = strlen(name); + ino_t ino = pos->ino; + + ctx->pos = pos->hash; + file->private_data = pos; + kernfs_get(pos); + + mutex_unlock(&kernfs_mutex); + if (!dir_emit(ctx, name, len, ino, type)) + return 0; + mutex_lock(&kernfs_mutex); + } + mutex_unlock(&kernfs_mutex); + file->private_data = NULL; + ctx->pos = INT_MAX; + return 0; +} + +static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, + int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} + +const struct file_operations kernfs_dir_fops = { + .read = generic_read_dir, + .iterate = kernfs_fop_readdir, + .release = kernfs_dir_fop_release, + .llseek = kernfs_dir_fop_llseek, +}; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c new file mode 100644 index 00000000000..d895b4b7b66 --- /dev/null +++ b/fs/kernfs/file.c @@ -0,0 +1,952 @@ +/* + * fs/kernfs/file.c - kernfs file implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/pagemap.h> +#include <linux/sched.h> +#include <linux/fsnotify.h> + +#include "kernfs-internal.h" + +/* + * There's one kernfs_open_file for each open file and one kernfs_open_node + * for each kernfs_node with one or more open files. + * + * kernfs_node->attr.open points to kernfs_open_node. attr.open is + * protected by kernfs_open_node_lock. + * + * filp->private_data points to seq_file whose ->private points to + * kernfs_open_file. kernfs_open_files are chained at + * kernfs_open_node->files, which is protected by kernfs_open_file_mutex. + */ +static DEFINE_SPINLOCK(kernfs_open_node_lock); +static DEFINE_MUTEX(kernfs_open_file_mutex); + +struct kernfs_open_node { + atomic_t refcnt; + atomic_t event; + wait_queue_head_t poll; + struct list_head files; /* goes through kernfs_open_file.list */ +}; + +/* + * kernfs_notify() may be called from any context and bounces notifications + * through a work item. To minimize space overhead in kernfs_node, the + * pending queue is implemented as a singly linked list of kernfs_nodes. + * The list is terminated with the self pointer so that whether a + * kernfs_node is on the list or not can be determined by testing the next + * pointer for NULL. + */ +#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) + +static DEFINE_SPINLOCK(kernfs_notify_lock); +static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; + +static struct kernfs_open_file *kernfs_of(struct file *file) +{ + return ((struct seq_file *)file->private_data)->private; +} + +/* + * Determine the kernfs_ops for the given kernfs_node. This function must + * be called while holding an active reference. + */ +static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) +{ + if (kn->flags & KERNFS_LOCKDEP) + lockdep_assert_held(kn); + return kn->attr.ops; +} + +/* + * As kernfs_seq_stop() is also called after kernfs_seq_start() or + * kernfs_seq_next() failure, it needs to distinguish whether it's stopping + * a seq_file iteration which is fully initialized with an active reference + * or an aborted kernfs_seq_start() due to get_active failure. The + * position pointer is the only context for each seq_file iteration and + * thus the stop condition should be encoded in it. As the return value is + * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable + * choice to indicate get_active failure. + * + * Unfortunately, this is complicated due to the optional custom seq_file + * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() + * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or + * custom seq_file operations and thus can't decide whether put_active + * should be performed or not only on ERR_PTR(-ENODEV). + * + * This is worked around by factoring out the custom seq_stop() and + * put_active part into kernfs_seq_stop_active(), skipping it from + * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after + * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures + * that kernfs_seq_stop_active() is skipped only after get_active failure. + */ +static void kernfs_seq_stop_active(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_stop) + ops->seq_stop(sf, v); + kernfs_put_active(of->kn); +} + +static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops; + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) + return ERR_PTR(-ENODEV); + + ops = kernfs_ops(of->kn); + if (ops->seq_start) { + void *next = ops->seq_start(sf, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } else { + /* + * The same behavior and code as single_open(). Returns + * !NULL if pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; + } +} + +static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_next) { + void *next = ops->seq_next(sf, v, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; + } +} + +static void kernfs_seq_stop(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + if (v != ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, v); + mutex_unlock(&of->mutex); +} + +static int kernfs_seq_show(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + of->event = atomic_read(&of->kn->attr.open->event); + + return of->kn->attr.ops->seq_show(sf, v); +} + +static const struct seq_operations kernfs_seq_ops = { + .start = kernfs_seq_start, + .next = kernfs_seq_next, + .stop = kernfs_seq_stop, + .show = kernfs_seq_show, +}; + +/* + * As reading a bin file can have side-effects, the exact offset and bytes + * specified in read(2) call should be passed to the read callback making + * it difficult to use seq_file. Implement simplistic custom buffering for + * bin files. + */ +static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, + char __user *user_buf, size_t count, + loff_t *ppos) +{ + ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; + char *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + len = -ENODEV; + mutex_unlock(&of->mutex); + goto out_free; + } + + ops = kernfs_ops(of->kn); + if (ops->read) + len = ops->read(of, buf, len, *ppos); + else + len = -EINVAL; + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + + if (len < 0) + goto out_free; + + if (copy_to_user(user_buf, buf, len)) { + len = -EFAULT; + goto out_free; + } + + *ppos += len; + + out_free: + kfree(buf); + return len; +} + +/** + * kernfs_fop_read - kernfs vfs read callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + */ +static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct kernfs_open_file *of = kernfs_of(file); + + if (of->kn->flags & KERNFS_HAS_SEQ_SHOW) + return seq_read(file, user_buf, count, ppos); + else + return kernfs_file_direct_read(of, user_buf, count, ppos); +} + +/** + * kernfs_fop_write - kernfs vfs write callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Copy data in from userland and pass it to the matching kernfs write + * operation. + * + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come on + * the first write. Hint: if you're writing a value, first read the file, + * modify only the the value you're changing, then write entire buffer + * back. + */ +static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct kernfs_open_file *of = kernfs_of(file); + const struct kernfs_ops *ops; + size_t len; + char *buf; + + if (of->atomic_write_len) { + len = count; + if (len > of->atomic_write_len) + return -E2BIG; + } else { + len = min_t(size_t, count, PAGE_SIZE); + } + + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_free; + } + buf[len] = '\0'; /* guarantee string termination */ + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + mutex_unlock(&of->mutex); + len = -ENODEV; + goto out_free; + } + + ops = kernfs_ops(of->kn); + if (ops->write) + len = ops->write(of, buf, len, *ppos); + else + len = -EINVAL; + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + + if (len > 0) + *ppos += len; +out_free: + kfree(buf); + return len; +} + +static void kernfs_vma_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + + if (!of->vm_ops) + return; + + if (!kernfs_get_active(of->kn)) + return; + + if (of->vm_ops->open) + of->vm_ops->open(vma); + + kernfs_put_active(of->kn); +} + +static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = VM_FAULT_SIGBUS; + if (of->vm_ops->fault) + ret = of->vm_ops->fault(vma, vmf); + + kernfs_put_active(of->kn); + return ret; +} + +static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = 0; + if (of->vm_ops->page_mkwrite) + ret = of->vm_ops->page_mkwrite(vma, vmf); + else + file_update_time(file); + + kernfs_put_active(of->kn); + return ret; +} + +static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return -EINVAL; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = -EINVAL; + if (of->vm_ops->access) + ret = of->vm_ops->access(vma, addr, buf, len, write); + + kernfs_put_active(of->kn); + return ret; +} + +#ifdef CONFIG_NUMA +static int kernfs_vma_set_policy(struct vm_area_struct *vma, + struct mempolicy *new) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = 0; + if (of->vm_ops->set_policy) + ret = of->vm_ops->set_policy(vma, new); + + kernfs_put_active(of->kn); + return ret; +} + +static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + struct mempolicy *pol; + + if (!of->vm_ops) + return vma->vm_policy; + + if (!kernfs_get_active(of->kn)) + return vma->vm_policy; + + pol = vma->vm_policy; + if (of->vm_ops->get_policy) + pol = of->vm_ops->get_policy(vma, addr); + + kernfs_put_active(of->kn); + return pol; +} + +static int kernfs_vma_migrate(struct vm_area_struct *vma, + const nodemask_t *from, const nodemask_t *to, + unsigned long flags) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!kernfs_get_active(of->kn)) + return 0; + + ret = 0; + if (of->vm_ops->migrate) + ret = of->vm_ops->migrate(vma, from, to, flags); + + kernfs_put_active(of->kn); + return ret; +} +#endif + +static const struct vm_operations_struct kernfs_vm_ops = { + .open = kernfs_vma_open, + .fault = kernfs_vma_fault, + .page_mkwrite = kernfs_vma_page_mkwrite, + .access = kernfs_vma_access, +#ifdef CONFIG_NUMA + .set_policy = kernfs_vma_set_policy, + .get_policy = kernfs_vma_get_policy, + .migrate = kernfs_vma_migrate, +#endif +}; + +static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct kernfs_open_file *of = kernfs_of(file); + const struct kernfs_ops *ops; + int rc; + + /* + * mmap path and of->mutex are prone to triggering spurious lockdep + * warnings and we don't want to add spurious locking dependency + * between the two. Check whether mmap is actually implemented + * without grabbing @of->mutex by testing HAS_MMAP flag. See the + * comment in kernfs_file_open() for more details. + */ + if (!(of->kn->flags & KERNFS_HAS_MMAP)) + return -ENODEV; + + mutex_lock(&of->mutex); + + rc = -ENODEV; + if (!kernfs_get_active(of->kn)) + goto out_unlock; + + ops = kernfs_ops(of->kn); + rc = ops->mmap(of, vma); + if (rc) + goto out_put; + + /* + * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() + * to satisfy versions of X which crash if the mmap fails: that + * substitutes a new vm_file, and we don't then want bin_vm_ops. + */ + if (vma->vm_file != file) + goto out_put; + + rc = -EINVAL; + if (of->mmapped && of->vm_ops != vma->vm_ops) + goto out_put; + + /* + * It is not possible to successfully wrap close. + * So error if someone is trying to use close. + */ + rc = -EINVAL; + if (vma->vm_ops && vma->vm_ops->close) + goto out_put; + + rc = 0; + of->mmapped = 1; + of->vm_ops = vma->vm_ops; + vma->vm_ops = &kernfs_vm_ops; +out_put: + kernfs_put_active(of->kn); +out_unlock: + mutex_unlock(&of->mutex); + + return rc; +} + +/** + * kernfs_get_open_node - get or create kernfs_open_node + * @kn: target kernfs_node + * @of: kernfs_open_file for this instance of open + * + * If @kn->attr.open exists, increment its reference count; otherwise, + * create one. @of is chained to the files list. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int kernfs_get_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + struct kernfs_open_node *on, *new_on = NULL; + + retry: + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irq(&kernfs_open_node_lock); + + if (!kn->attr.open && new_on) { + kn->attr.open = new_on; + new_on = NULL; + } + + on = kn->attr.open; + if (on) { + atomic_inc(&on->refcnt); + list_add_tail(&of->list, &on->files); + } + + spin_unlock_irq(&kernfs_open_node_lock); + mutex_unlock(&kernfs_open_file_mutex); + + if (on) { + kfree(new_on); + return 0; + } + + /* not there, initialize a new one and retry */ + new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); + if (!new_on) + return -ENOMEM; + + atomic_set(&new_on->refcnt, 0); + atomic_set(&new_on->event, 1); + init_waitqueue_head(&new_on->poll); + INIT_LIST_HEAD(&new_on->files); + goto retry; +} + +/** + * kernfs_put_open_node - put kernfs_open_node + * @kn: target kernfs_nodet + * @of: associated kernfs_open_file + * + * Put @kn->attr.open and unlink @of from the files list. If + * reference count reaches zero, disassociate and free it. + * + * LOCKING: + * None. + */ +static void kernfs_put_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + struct kernfs_open_node *on = kn->attr.open; + unsigned long flags; + + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irqsave(&kernfs_open_node_lock, flags); + + if (of) + list_del(&of->list); + + if (atomic_dec_and_test(&on->refcnt)) + kn->attr.open = NULL; + else + on = NULL; + + spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + mutex_unlock(&kernfs_open_file_mutex); + + kfree(on); +} + +static int kernfs_fop_open(struct inode *inode, struct file *file) +{ + struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(kn); + const struct kernfs_ops *ops; + struct kernfs_open_file *of; + bool has_read, has_write, has_mmap; + int error = -EACCES; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + ops = kernfs_ops(kn); + + has_read = ops->seq_show || ops->read || ops->mmap; + has_write = ops->write || ops->mmap; + has_mmap = ops->mmap; + + /* see the flag definition for details */ + if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; + + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + } + + /* allocate a kernfs_open_file for the file */ + error = -ENOMEM; + of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); + if (!of) + goto err_out; + + /* + * The following is done to give a different lockdep key to + * @of->mutex for files which implement mmap. This is a rather + * crude way to avoid false positive lockdep warning around + * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and + * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under + * which mm->mmap_sem nests, while holding @of->mutex. As each + * open file has a separate mutex, it's okay as long as those don't + * happen on the same file. At this point, we can't easily give + * each file a separate locking class. Let's differentiate on + * whether the file has mmap or not for now. + * + * Both paths of the branch look the same. They're supposed to + * look that way and give @of->mutex different static lockdep keys. + */ + if (has_mmap) + mutex_init(&of->mutex); + else + mutex_init(&of->mutex); + + of->kn = kn; + of->file = file; + + /* + * Write path needs to atomic_write_len outside active reference. + * Cache it in open_file. See kernfs_fop_write() for details. + */ + of->atomic_write_len = ops->atomic_write_len; + + /* + * Always instantiate seq_file even if read access doesn't use + * seq_file or is not requested. This unifies private data access + * and readable regular files are the vast majority anyway. + */ + if (ops->seq_show) + error = seq_open(file, &kernfs_seq_ops); + else + error = seq_open(file, NULL); + if (error) + goto err_free; + + ((struct seq_file *)file->private_data)->private = of; + + /* seq_file clears PWRITE unconditionally, restore it if WRITE */ + if (file->f_mode & FMODE_WRITE) + file->f_mode |= FMODE_PWRITE; + + /* make sure we have open node struct */ + error = kernfs_get_open_node(kn, of); + if (error) + goto err_close; + + /* open succeeded, put active references */ + kernfs_put_active(kn); + return 0; + +err_close: + seq_release(inode, file); +err_free: + kfree(of); +err_out: + kernfs_put_active(kn); + return error; +} + +static int kernfs_fop_release(struct inode *inode, struct file *filp) +{ + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_open_file *of = kernfs_of(filp); + + kernfs_put_open_node(kn, of); + seq_release(inode, filp); + kfree(of); + + return 0; +} + +void kernfs_unmap_bin_file(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + struct kernfs_open_file *of; + + if (!(kn->flags & KERNFS_HAS_MMAP)) + return; + + spin_lock_irq(&kernfs_open_node_lock); + on = kn->attr.open; + if (on) + atomic_inc(&on->refcnt); + spin_unlock_irq(&kernfs_open_node_lock); + if (!on) + return; + + mutex_lock(&kernfs_open_file_mutex); + list_for_each_entry(of, &on->files, list) { + struct inode *inode = file_inode(of->file); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + } + mutex_unlock(&kernfs_open_file_mutex); + + kernfs_put_open_node(kn, NULL); +} + +/* + * Kernfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, or seek to 0 and read again. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Neither 'poll' nor 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) +{ + struct kernfs_open_file *of = kernfs_of(filp); + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_open_node *on = kn->attr.open; + + /* need parent for the kobj, grab both */ + if (!kernfs_get_active(kn)) + goto trigger; + + poll_wait(filp, &on->poll, wait); + + kernfs_put_active(kn); + + if (of->event != atomic_read(&on->event)) + goto trigger; + + return DEFAULT_POLLMASK; + + trigger: + return DEFAULT_POLLMASK|POLLERR|POLLPRI; +} + +static void kernfs_notify_workfn(struct work_struct *work) +{ + struct kernfs_node *kn; + struct kernfs_open_node *on; + struct kernfs_super_info *info; +repeat: + /* pop one off the notify_list */ + spin_lock_irq(&kernfs_notify_lock); + kn = kernfs_notify_list; + if (kn == KERNFS_NOTIFY_EOL) { + spin_unlock_irq(&kernfs_notify_lock); + return; + } + kernfs_notify_list = kn->attr.notify_next; + kn->attr.notify_next = NULL; + spin_unlock_irq(&kernfs_notify_lock); + + /* kick poll */ + spin_lock_irq(&kernfs_open_node_lock); + + on = kn->attr.open; + if (on) { + atomic_inc(&on->event); + wake_up_interruptible(&on->poll); + } + + spin_unlock_irq(&kernfs_open_node_lock); + + /* kick fsnotify */ + mutex_lock(&kernfs_mutex); + + list_for_each_entry(info, &kernfs_root(kn)->supers, node) { + struct inode *inode; + struct dentry *dentry; + + inode = ilookup(info->sb, kn->ino); + if (!inode) + continue; + + dentry = d_find_any_alias(inode); + if (dentry) { + fsnotify_parent(NULL, dentry, FS_MODIFY); + fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, + NULL, 0); + dput(dentry); + } + + iput(inode); + } + + mutex_unlock(&kernfs_mutex); + kernfs_put(kn); + goto repeat; +} + +/** + * kernfs_notify - notify a kernfs file + * @kn: file to notify + * + * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any + * context. + */ +void kernfs_notify(struct kernfs_node *kn) +{ + static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); + unsigned long flags; + + if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) + return; + + spin_lock_irqsave(&kernfs_notify_lock, flags); + if (!kn->attr.notify_next) { + kernfs_get(kn); + kn->attr.notify_next = kernfs_notify_list; + kernfs_notify_list = kn; + schedule_work(&kernfs_notify_work); + } + spin_unlock_irqrestore(&kernfs_notify_lock, flags); +} +EXPORT_SYMBOL_GPL(kernfs_notify); + +const struct file_operations kernfs_file_fops = { + .read = kernfs_fop_read, + .write = kernfs_fop_write, + .llseek = generic_file_llseek, + .mmap = kernfs_fop_mmap, + .open = kernfs_fop_open, + .release = kernfs_fop_release, + .poll = kernfs_fop_poll, +}; + +/** + * __kernfs_create_file - kernfs internal function to create a file + * @parent: directory to create the file in + * @name: name of the file + * @mode: mode of the file + * @size: size of the file + * @ops: kernfs operations for the file + * @priv: private data for the file + * @ns: optional namespace tag of the file + * @static_name: don't copy file name + * @key: lockdep key for the file's active_ref, %NULL to disable lockdep + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + bool name_is_static, + struct lock_class_key *key) +{ + struct kernfs_node *kn; + unsigned flags; + int rc; + + flags = KERNFS_FILE; + if (name_is_static) + flags |= KERNFS_STATIC_NAME; + + kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->attr.ops = ops; + kn->attr.size = size; + kn->ns = ns; + kn->priv = priv; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (key) { + lockdep_init_map(&kn->dep_map, "s_active", key, 0); + kn->flags |= KERNFS_LOCKDEP; + } +#endif + + /* + * kn->attr.ops is accesible only while holding active ref. We + * need to know whether some ops are implemented outside active + * ref. Cache their existence in flags. + */ + if (ops->seq_show) + kn->flags |= KERNFS_HAS_SEQ_SHOW; + if (ops->mmap) + kn->flags |= KERNFS_HAS_MMAP; + + rc = kernfs_add_one(kn); + if (rc) { + kernfs_put(kn); + return ERR_PTR(rc); + } + return kn; +} diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c new file mode 100644 index 00000000000..985217626e6 --- /dev/null +++ b/fs/kernfs/inode.c @@ -0,0 +1,383 @@ +/* + * fs/kernfs/inode.c - kernfs inode implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/pagemap.h> +#include <linux/backing-dev.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/xattr.h> +#include <linux/security.h> + +#include "kernfs-internal.h" + +static const struct address_space_operations kernfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, +}; + +static struct backing_dev_info kernfs_bdi = { + .name = "kernfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static const struct inode_operations kernfs_iops = { + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, +}; + +void __init kernfs_inode_init(void) +{ + if (bdi_init(&kernfs_bdi)) + panic("failed to init kernfs_bdi"); +} + +static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) +{ + static DEFINE_MUTEX(iattr_mutex); + struct kernfs_iattrs *ret; + struct iattr *iattrs; + + mutex_lock(&iattr_mutex); + + if (kn->iattr) + goto out_unlock; + + kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); + if (!kn->iattr) + goto out_unlock; + iattrs = &kn->iattr->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = kn->mode; + iattrs->ia_uid = GLOBAL_ROOT_UID; + iattrs->ia_gid = GLOBAL_ROOT_GID; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + simple_xattrs_init(&kn->iattr->xattrs); +out_unlock: + ret = kn->iattr; + mutex_unlock(&iattr_mutex); + return ret; +} + +static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + struct kernfs_iattrs *attrs; + struct iattr *iattrs; + unsigned int ia_valid = iattr->ia_valid; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + iattrs = &attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = iattr->ia_atime; + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = iattr->ia_mtime; + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + iattrs->ia_mode = kn->mode = mode; + } + return 0; +} + +/** + * kernfs_setattr - set iattr on a node + * @kn: target node + * @iattr: iattr to set + * + * Returns 0 on success, -errno on failure. + */ +int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + int ret; + + mutex_lock(&kernfs_mutex); + ret = __kernfs_setattr(kn, iattr); + mutex_unlock(&kernfs_mutex); + return ret; +} + +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct kernfs_node *kn = dentry->d_fsdata; + int error; + + if (!kn) + return -EINVAL; + + mutex_lock(&kernfs_mutex); + error = inode_change_ok(inode, iattr); + if (error) + goto out; + + error = __kernfs_setattr(kn, iattr); + if (error) + goto out; + + /* this ignores size changes */ + setattr_copy(inode, iattr); + +out: + mutex_unlock(&kernfs_mutex); + return error; +} + +static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, + u32 *secdata_len) +{ + struct kernfs_iattrs *attrs; + void *old_secdata; + size_t old_secdata_len; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + old_secdata = attrs->ia_secdata; + old_secdata_len = attrs->ia_secdata_len; + + attrs->ia_secdata = *secdata; + attrs->ia_secdata_len = *secdata_len; + + *secdata = old_secdata; + *secdata_len = old_secdata_len; + return 0; +} + +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + void *secdata; + int error; + u32 secdata_len = 0; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + return error; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + return error; + + mutex_lock(&kernfs_mutex); + error = kernfs_node_setsecdata(kn, &secdata, &secdata_len); + mutex_unlock(&kernfs_mutex); + + if (secdata) + security_release_secctx(secdata, secdata_len); + return error; + } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { + return simple_xattr_set(&attrs->xattrs, name, value, size, + flags); + } + + return -EINVAL; +} + +int kernfs_iop_removexattr(struct dentry *dentry, const char *name) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_remove(&attrs->xattrs, name); +} + +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_get(&attrs->xattrs, name, buf, size); +} + +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_list(&attrs->xattrs, buf, size); +} + +static inline void set_default_inode_attr(struct inode *inode, umode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) +{ + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) +{ + struct kernfs_iattrs *attrs = kn->iattr; + + inode->i_mode = kn->mode; + if (attrs) { + /* + * kernfs_node has non-default attributes get them from + * persistent copy in kernfs_node. + */ + set_inode_attr(inode, &attrs->ia_iattr); + security_inode_notifysecctx(inode, attrs->ia_secdata, + attrs->ia_secdata_len); + } + + if (kernfs_type(kn) == KERNFS_DIR) + set_nlink(inode, kn->dir.subdirs + 2); +} + +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct inode *inode = dentry->d_inode; + + mutex_lock(&kernfs_mutex); + kernfs_refresh_inode(kn, inode); + mutex_unlock(&kernfs_mutex); + + generic_fillattr(inode, stat); + return 0; +} + +static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) +{ + kernfs_get(kn); + inode->i_private = kn; + inode->i_mapping->a_ops = &kernfs_aops; + inode->i_mapping->backing_dev_info = &kernfs_bdi; + inode->i_op = &kernfs_iops; + + set_default_inode_attr(inode, kn->mode); + kernfs_refresh_inode(kn, inode); + + /* initialize inode according to type */ + switch (kernfs_type(kn)) { + case KERNFS_DIR: + inode->i_op = &kernfs_dir_iops; + inode->i_fop = &kernfs_dir_fops; + break; + case KERNFS_FILE: + inode->i_size = kn->attr.size; + inode->i_fop = &kernfs_file_fops; + break; + case KERNFS_LINK: + inode->i_op = &kernfs_symlink_iops; + break; + default: + BUG(); + } + + unlock_new_inode(inode); +} + +/** + * kernfs_get_inode - get inode for kernfs_node + * @sb: super block + * @kn: kernfs_node to allocate inode for + * + * Get inode for @kn. If such inode doesn't exist, a new inode is + * allocated and basics are initialized. New inode is returned + * locked. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * Pointer to allocated inode on success, NULL on failure. + */ +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) +{ + struct inode *inode; + + inode = iget_locked(sb, kn->ino); + if (inode && (inode->i_state & I_NEW)) + kernfs_init_inode(kn, inode); + + return inode; +} + +/* + * The kernfs_node serves as both an inode and a directory entry for + * kernfs. To prevent the kernfs inode numbers from being freed + * prematurely we take a reference to kernfs_node from the kernfs inode. A + * super_operations.evict_inode() implementation is needed to drop that + * reference upon inode destruction. + */ +void kernfs_evict_inode(struct inode *inode) +{ + struct kernfs_node *kn = inode->i_private; + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + kernfs_put(kn); +} + +int kernfs_iop_permission(struct inode *inode, int mask) +{ + struct kernfs_node *kn; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + kn = inode->i_private; + + mutex_lock(&kernfs_mutex); + kernfs_refresh_inode(kn, inode); + mutex_unlock(&kernfs_mutex); + + return generic_permission(inode, mask); +} diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h new file mode 100644 index 00000000000..dc84a3ef9ca --- /dev/null +++ b/fs/kernfs/kernfs-internal.h @@ -0,0 +1,120 @@ +/* + * fs/kernfs/kernfs-internal.h - kernfs internal header file + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> + * + * This file is released under the GPLv2. + */ + +#ifndef __KERNFS_INTERNAL_H +#define __KERNFS_INTERNAL_H + +#include <linux/lockdep.h> +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/xattr.h> + +#include <linux/kernfs.h> + +struct kernfs_iattrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; + + struct simple_xattrs xattrs; +}; + +/* +1 to avoid triggering overflow warning when negating it */ +#define KN_DEACTIVATED_BIAS (INT_MIN + 1) + +/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ + +/** + * kernfs_root - find out the kernfs_root a kernfs_node belongs to + * @kn: kernfs_node of interest + * + * Return the kernfs_root @kn belongs to. + */ +static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) +{ + /* if parent exists, it's always a dir; otherwise, @sd is a dir */ + if (kn->parent) + kn = kn->parent; + return kn->dir.root; +} + +/* + * mount.c + */ +struct kernfs_super_info { + struct super_block *sb; + + /* + * The root associated with this super_block. Each super_block is + * identified by the root and ns it's associated with. + */ + struct kernfs_root *root; + + /* + * Each sb is associated with one namespace tag, currently the + * network namespace of the task which mounted this kernfs + * instance. If multiple tags become necessary, make the following + * an array and compare kernfs_node tag against every entry. + */ + const void *ns; + + /* anchored at kernfs_root->supers, protected by kernfs_mutex */ + struct list_head node; +}; +#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) + +extern const struct super_operations kernfs_sops; +extern struct kmem_cache *kernfs_node_cache; + +/* + * inode.c + */ +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); +void kernfs_evict_inode(struct inode *inode); +int kernfs_iop_permission(struct inode *inode, int mask); +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int kernfs_iop_removexattr(struct dentry *dentry, const char *name); +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); +void kernfs_inode_init(void); + +/* + * dir.c + */ +extern struct mutex kernfs_mutex; +extern const struct dentry_operations kernfs_dops; +extern const struct file_operations kernfs_dir_fops; +extern const struct inode_operations kernfs_dir_iops; + +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); +void kernfs_put_active(struct kernfs_node *kn); +int kernfs_add_one(struct kernfs_node *kn); +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags); + +/* + * file.c + */ +extern const struct file_operations kernfs_file_fops; + +void kernfs_unmap_bin_file(struct kernfs_node *kn); + +/* + * symlink.c + */ +extern const struct inode_operations kernfs_symlink_iops; + +#endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c new file mode 100644 index 00000000000..f973ae9b05f --- /dev/null +++ b/fs/kernfs/mount.c @@ -0,0 +1,250 @@ +/* + * fs/kernfs/mount.c - kernfs mount implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/init.h> +#include <linux/magic.h> +#include <linux/slab.h> +#include <linux/pagemap.h> + +#include "kernfs-internal.h" + +struct kmem_cache *kernfs_node_cache; + +static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct kernfs_root *root = kernfs_info(sb)->root; + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->remount_fs) + return scops->remount_fs(root, flags, data); + return 0; +} + +static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) +{ + struct kernfs_root *root = kernfs_root(dentry->d_fsdata); + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->show_options) + return scops->show_options(sf, root); + return 0; +} + +const struct super_operations kernfs_sops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = kernfs_evict_inode, + + .remount_fs = kernfs_sop_remount_fs, + .show_options = kernfs_sop_show_options, +}; + +/** + * kernfs_root_from_sb - determine kernfs_root associated with a super_block + * @sb: the super_block in question + * + * Return the kernfs_root associated with @sb. If @sb is not a kernfs one, + * %NULL is returned. + */ +struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) +{ + if (sb->s_op == &kernfs_sops) + return kernfs_info(sb)->root; + return NULL; +} + +static int kernfs_fill_super(struct super_block *sb, unsigned long magic) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct inode *inode; + struct dentry *root; + + info->sb = sb; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = magic; + sb->s_op = &kernfs_sops; + sb->s_time_gran = 1; + + /* get root inode, initialize and unlock it */ + mutex_lock(&kernfs_mutex); + inode = kernfs_get_inode(sb, info->root->kn); + mutex_unlock(&kernfs_mutex); + if (!inode) { + pr_debug("kernfs: could not get root inode\n"); + return -ENOMEM; + } + + /* instantiate and link root dentry */ + root = d_make_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n", __func__); + return -ENOMEM; + } + kernfs_get(info->root->kn); + root->d_fsdata = info->root->kn; + sb->s_root = root; + sb->s_d_op = &kernfs_dops; + return 0; +} + +static int kernfs_test_super(struct super_block *sb, void *data) +{ + struct kernfs_super_info *sb_info = kernfs_info(sb); + struct kernfs_super_info *info = data; + + return sb_info->root == info->root && sb_info->ns == info->ns; +} + +static int kernfs_set_super(struct super_block *sb, void *data) +{ + int error; + error = set_anon_super(sb, data); + if (!error) + sb->s_fs_info = data; + return error; +} + +/** + * kernfs_super_ns - determine the namespace tag of a kernfs super_block + * @sb: super_block of interest + * + * Return the namespace tag associated with kernfs super_block @sb. + */ +const void *kernfs_super_ns(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + + return info->ns; +} + +/** + * kernfs_mount_ns - kernfs mount helper + * @fs_type: file_system_type of the fs being mounted + * @flags: mount flags specified for the mount + * @root: kernfs_root of the hierarchy being mounted + * @magic: file system specific magic number + * @new_sb_created: tell the caller if we allocated a new superblock + * @ns: optional namespace tag of the mount + * + * This is to be called from each kernfs user's file_system_type->mount() + * implementation, which should pass through the specified @fs_type and + * @flags, and specify the hierarchy and namespace tag to mount via @root + * and @ns, respectively. + * + * The return value can be passed to the vfs layer verbatim. + */ +struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns) +{ + struct super_block *sb; + struct kernfs_super_info *info; + int error; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + info->root = root; + info->ns = ns; + + sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info); + if (IS_ERR(sb) || sb->s_fs_info != info) + kfree(info); + if (IS_ERR(sb)) + return ERR_CAST(sb); + + if (new_sb_created) + *new_sb_created = !sb->s_root; + + if (!sb->s_root) { + struct kernfs_super_info *info = kernfs_info(sb); + + error = kernfs_fill_super(sb, magic); + if (error) { + deactivate_locked_super(sb); + return ERR_PTR(error); + } + sb->s_flags |= MS_ACTIVE; + + mutex_lock(&kernfs_mutex); + list_add(&info->node, &root->supers); + mutex_unlock(&kernfs_mutex); + } + + return dget(sb->s_root); +} + +/** + * kernfs_kill_sb - kill_sb for kernfs + * @sb: super_block being killed + * + * This can be used directly for file_system_type->kill_sb(). If a kernfs + * user needs extra cleanup, it can implement its own kill_sb() and call + * this function at the end. + */ +void kernfs_kill_sb(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_node *root_kn = sb->s_root->d_fsdata; + + mutex_lock(&kernfs_mutex); + list_del(&info->node); + mutex_unlock(&kernfs_mutex); + + /* + * Remove the superblock from fs_supers/s_instances + * so we can't find it, before freeing kernfs_super_info. + */ + kill_anon_super(sb); + kfree(info); + kernfs_put(root_kn); +} + +/** + * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root + * @kernfs_root: the kernfs_root in question + * @ns: the namespace tag + * + * Pin the superblock so the superblock won't be destroyed in subsequent + * operations. This can be used to block ->kill_sb() which may be useful + * for kernfs users which dynamically manage superblocks. + * + * Returns NULL if there's no superblock associated to this kernfs_root, or + * -EINVAL if the superblock is being freed. + */ +struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) +{ + struct kernfs_super_info *info; + struct super_block *sb = NULL; + + mutex_lock(&kernfs_mutex); + list_for_each_entry(info, &root->supers, node) { + if (info->ns == ns) { + sb = info->sb; + if (!atomic_inc_not_zero(&info->sb->s_active)) + sb = ERR_PTR(-EINVAL); + break; + } + } + mutex_unlock(&kernfs_mutex); + return sb; +} + +void __init kernfs_init(void) +{ + kernfs_node_cache = kmem_cache_create("kernfs_node_cache", + sizeof(struct kernfs_node), + 0, SLAB_PANIC, NULL); + kernfs_inode_init(); +} diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c new file mode 100644 index 00000000000..8a198898e39 --- /dev/null +++ b/fs/kernfs/symlink.c @@ -0,0 +1,147 @@ +/* + * fs/kernfs/symlink.c - kernfs symlink implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/namei.h> + +#include "kernfs-internal.h" + +/** + * kernfs_create_link - create a symlink + * @parent: directory to create the symlink in + * @name: name of the symlink + * @target: target node for the symlink to point to + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, + const char *name, + struct kernfs_node *target) +{ + struct kernfs_node *kn; + int error; + + kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); + if (!kn) + return ERR_PTR(-ENOMEM); + + if (kernfs_ns_enabled(parent)) + kn->ns = target->ns; + kn->symlink.target_kn = target; + kernfs_get(target); /* ref owned by symlink */ + + error = kernfs_add_one(kn); + if (!error) + return kn; + + kernfs_put(kn); + return ERR_PTR(error); +} + +static int kernfs_get_target_path(struct kernfs_node *parent, + struct kernfs_node *target, char *path) +{ + struct kernfs_node *base, *kn; + char *s = path; + int len = 0; + + /* go up to the root, stop at the base */ + base = parent; + while (base->parent) { + kn = target->parent; + while (kn->parent && base != kn) + kn = kn->parent; + + if (base == kn) + break; + + strcpy(s, "../"); + s += 3; + base = base->parent; + } + + /* determine end of target string for reverse fillup */ + kn = target; + while (kn->parent && kn != base) { + len += strlen(kn->name) + 1; + kn = kn->parent; + } + + /* check limits */ + if (len < 2) + return -EINVAL; + len--; + if ((s - path) + len > PATH_MAX) + return -ENAMETOOLONG; + + /* reverse fillup of target string from target to base */ + kn = target; + while (kn->parent && kn != base) { + int slen = strlen(kn->name); + + len -= slen; + strncpy(s + len, kn->name, slen); + if (len) + s[--len] = '/'; + + kn = kn->parent; + } + + return 0; +} + +static int kernfs_getlink(struct dentry *dentry, char *path) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *parent = kn->parent; + struct kernfs_node *target = kn->symlink.target_kn; + int error; + + mutex_lock(&kernfs_mutex); + error = kernfs_get_target_path(parent, target, path); + mutex_unlock(&kernfs_mutex); + + return error; +} + +static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + if (page) { + error = kernfs_getlink(dentry, (char *) page); + if (error < 0) + free_page((unsigned long)page); + } + nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); + return NULL; +} + +static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *page = nd_get_link(nd); + if (!IS_ERR(page)) + free_page((unsigned long)page); +} + +const struct inode_operations kernfs_symlink_iops = { + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, + .readlink = generic_readlink, + .follow_link = kernfs_iop_follow_link, + .put_link = kernfs_iop_put_link, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .permission = kernfs_iop_permission, +}; diff --git a/fs/libfs.c b/fs/libfs.c index 8c501849315..88e3e00e2ec 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -3,6 +3,7 @@ * Library for filesystems writers. */ +#include <linux/blkdev.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/slab.h> @@ -47,10 +48,16 @@ EXPORT_SYMBOL(simple_statfs); * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately. */ -static int simple_delete_dentry(const struct dentry *dentry) +int always_delete_dentry(const struct dentry *dentry) { return 1; } +EXPORT_SYMBOL(always_delete_dentry); + +const struct dentry_operations simple_dentry_operations = { + .d_delete = always_delete_dentry, +}; +EXPORT_SYMBOL(simple_dentry_operations); /* * Lookup the data. This is trivial - if the dentry didn't already @@ -58,10 +65,6 @@ static int simple_delete_dentry(const struct dentry *dentry) */ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - static const struct dentry_operations simple_dentry_operations = { - .d_delete = simple_delete_dentry, - }; - if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) @@ -921,16 +924,19 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, EXPORT_SYMBOL_GPL(generic_fh_to_parent); /** - * generic_file_fsync - generic fsync implementation for simple filesystems + * __generic_file_fsync - generic fsync implementation for simple filesystems + * * @file: file to synchronize + * @start: start offset in bytes + * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is a generic implementation of the fsync method for simple * filesystems which track all non-inode metadata in the buffers list * hanging off the address_space structure. */ -int generic_file_fsync(struct file *file, loff_t start, loff_t end, - int datasync) +int __generic_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { struct inode *inode = file->f_mapping->host; int err; @@ -950,10 +956,34 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end, err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; + out: mutex_unlock(&inode->i_mutex); return ret; } +EXPORT_SYMBOL(__generic_file_fsync); + +/** + * generic_file_fsync - generic fsync implementation for simple filesystems + * with flush + * @file: file to synchronize + * @start: start offset in bytes + * @end: end offset in bytes (inclusive) + * @datasync: only synchronize essential metadata if true + * + */ + +int generic_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + int err; + + err = __generic_file_fsync(file, start, end, datasync); + if (err) + return err; + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); +} EXPORT_SYMBOL(generic_file_fsync); /** @@ -1002,3 +1032,46 @@ void kfree_put_link(struct dentry *dentry, struct nameidata *nd, kfree(s); } EXPORT_SYMBOL(kfree_put_link); + +/* + * nop .set_page_dirty method so that people can use .page_mkwrite on + * anon inodes. + */ +static int anon_set_page_dirty(struct page *page) +{ + return 0; +}; + +/* + * A single inode exists for all anon_inode files. Contrary to pipes, + * anon_inode inodes have no associated per-instance data, so we need + * only allocate one of them. + */ +struct inode *alloc_anon_inode(struct super_block *s) +{ + static const struct address_space_operations anon_aops = { + .set_page_dirty = anon_set_page_dirty, + }; + struct inode *inode = new_inode_pseudo(s); + + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_ino = get_next_ino(); + inode->i_mapping->a_ops = &anon_aops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because mark_inode_dirty() will think + * that it already _is_ on the dirty list. + */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_flags |= S_PRIVATE; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + return inode; +} +EXPORT_SYMBOL(alloc_anon_inode); diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 00ec0b9c94d..d3e40db2893 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -14,6 +14,8 @@ #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> +#include <uapi/linux/nfs3.h> + #define NLMDBG_FACILITY NLMDBG_XDR #if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 9a55797a1cd..3e9f7874b97 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -15,6 +15,8 @@ #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> +#include <uapi/linux/nfs2.h> + #define NLMDBG_FACILITY NLMDBG_XDR #if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 10d6c41aeca..8f27c93f8d2 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -235,6 +235,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); + svc_shutdown_net(serv, net); return err; } @@ -435,7 +436,7 @@ EXPORT_SYMBOL_GPL(lockd_down); * Sysctl parameters (same as module parameters, different interface). */ -static ctl_table nlm_sysctls[] = { +static struct ctl_table nlm_sysctls[] = { { .procname = "nlm_grace_period", .data = &nlm_grace_period, @@ -489,7 +490,7 @@ static ctl_table nlm_sysctls[] = { { } }; -static ctl_table nlm_sysctl_dir[] = { +static struct ctl_table nlm_sysctl_dir[] = { { .procname = "nfs", .mode = 0555, @@ -498,7 +499,7 @@ static ctl_table nlm_sysctl_dir[] = { { } }; -static ctl_table nlm_sysctl_root[] = { +static struct ctl_table nlm_sysctl_root[] = { { .procname = "fs", .mode = 0555, @@ -621,8 +622,8 @@ static int __init init_nlm(void) err_pernet: #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); -#endif err_sysctl: +#endif return err; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e066a390297..ab798a88ec1 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -779,6 +779,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) struct nlm_file *file = block->b_file; struct nlm_lock *lock = &block->b_call->a_args.lock; int error; + loff_t fl_start, fl_end; dprintk("lockd: grant blocked lock %p\n", block); @@ -796,9 +797,16 @@ nlmsvc_grant_blocked(struct nlm_block *block) } /* Try the lock operation again */ + /* vfs_lock_file() can mangle fl_start and fl_end, but we need + * them unchanged for the GRANT_MSG + */ lock->fl.fl_flags |= FL_SLEEP; + fl_start = lock->fl.fl_start; + fl_end = lock->fl.fl_end; error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.fl_start = fl_start; + lock->fl.fl_end = fl_end; switch (error) { case 0: diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index dc5c75930f0..b6f3b84b6e9 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -14,12 +14,11 @@ #include <linux/mutex.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/addr.h> -#include <linux/nfsd/nfsfh.h> -#include <linux/nfsd/export.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> #include <linux/module.h> #include <linux/mount.h> +#include <uapi/linux/nfs2.h> #define NLMDBG_FACILITY NLMDBG_SVCSUBS diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 964666c68a8..9340e7e10ef 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -16,6 +16,8 @@ #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> +#include <uapi/linux/nfs2.h> + #define NLMDBG_FACILITY NLMDBG_XDR diff --git a/fs/locks.c b/fs/locks.c index b27a3005d78..717fbc404e6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -130,11 +130,15 @@ #include <linux/percpu.h> #include <linux/lglock.h> +#define CREATE_TRACE_POINTS +#include <trace/events/filelock.h> + #include <asm/uaccess.h> #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) -#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) +#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) +#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) static bool lease_breaking(struct file_lock *fl) { @@ -321,6 +325,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock, return -ENOMEM; fl->fl_file = filp; + fl->fl_owner = (fl_owner_t)filp; fl->fl_pid = current->tgid; fl->fl_flags = FL_FLOCK; fl->fl_type = type; @@ -344,48 +349,43 @@ static int assign_type(struct file_lock *fl, long type) return 0; } -/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX - * style lock. - */ -static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, - struct flock *l) +static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock64 *l) { - off_t start, end; - switch (l->l_whence) { case SEEK_SET: - start = 0; + fl->fl_start = 0; break; case SEEK_CUR: - start = filp->f_pos; + fl->fl_start = filp->f_pos; break; case SEEK_END: - start = i_size_read(file_inode(filp)); + fl->fl_start = i_size_read(file_inode(filp)); break; default: return -EINVAL; } + if (l->l_start > OFFSET_MAX - fl->fl_start) + return -EOVERFLOW; + fl->fl_start += l->l_start; + if (fl->fl_start < 0) + return -EINVAL; /* POSIX-1996 leaves the case l->l_len < 0 undefined; POSIX-2001 defines it. */ - start += l->l_start; - if (start < 0) - return -EINVAL; - fl->fl_end = OFFSET_MAX; if (l->l_len > 0) { - end = start + l->l_len - 1; - fl->fl_end = end; + if (l->l_len - 1 > OFFSET_MAX - fl->fl_start) + return -EOVERFLOW; + fl->fl_end = fl->fl_start + l->l_len - 1; + } else if (l->l_len < 0) { - end = start - 1; - fl->fl_end = end; - start += l->l_len; - if (start < 0) + if (fl->fl_start + l->l_len < 0) return -EINVAL; - } - fl->fl_start = start; /* we record the absolute position */ - if (fl->fl_end < fl->fl_start) - return -EOVERFLOW; - + fl->fl_end = fl->fl_start - 1; + fl->fl_start += l->l_len; + } else + fl->fl_end = OFFSET_MAX; + fl->fl_owner = current->files; fl->fl_pid = current->tgid; fl->fl_file = filp; @@ -396,52 +396,21 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, return assign_type(fl, l->l_type); } -#if BITS_PER_LONG == 32 -static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, - struct flock64 *l) +/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX + * style lock. + */ +static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock *l) { - loff_t start; - - switch (l->l_whence) { - case SEEK_SET: - start = 0; - break; - case SEEK_CUR: - start = filp->f_pos; - break; - case SEEK_END: - start = i_size_read(file_inode(filp)); - break; - default: - return -EINVAL; - } - - start += l->l_start; - if (start < 0) - return -EINVAL; - fl->fl_end = OFFSET_MAX; - if (l->l_len > 0) { - fl->fl_end = start + l->l_len - 1; - } else if (l->l_len < 0) { - fl->fl_end = start - 1; - start += l->l_len; - if (start < 0) - return -EINVAL; - } - fl->fl_start = start; /* we record the absolute position */ - if (fl->fl_end < fl->fl_start) - return -EOVERFLOW; - - fl->fl_owner = current->files; - fl->fl_pid = current->tgid; - fl->fl_file = filp; - fl->fl_flags = FL_POSIX; - fl->fl_ops = NULL; - fl->fl_lmops = NULL; + struct flock64 ll = { + .l_type = l->l_type, + .l_whence = l->l_whence, + .l_start = l->l_start, + .l_len = l->l_len, + }; - return assign_type(fl, l->l_type); + return flock64_to_posix_lock(filp, fl, &ll); } -#endif /* default lease lock manager operations */ static void lease_break_callback(struct file_lock *fl) @@ -462,7 +431,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl) if (assign_type(fl, type) != 0) return -EINVAL; - fl->fl_owner = current->files; + fl->fl_owner = (fl_owner_t)current->files; fl->fl_pid = current->tgid; fl->fl_file = filp; @@ -511,8 +480,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) } /* Must be called with the i_lock held! */ -static inline void -locks_insert_global_locks(struct file_lock *fl) +static void locks_insert_global_locks(struct file_lock *fl) { lg_local_lock(&file_lock_lglock); fl->fl_link_cpu = smp_processor_id(); @@ -521,8 +489,7 @@ locks_insert_global_locks(struct file_lock *fl) } /* Must be called with the i_lock held! */ -static inline void -locks_delete_global_locks(struct file_lock *fl) +static void locks_delete_global_locks(struct file_lock *fl) { /* * Avoid taking lock if already unhashed. This is safe since this check @@ -544,14 +511,12 @@ posix_owner_key(struct file_lock *fl) return (unsigned long)fl->fl_owner; } -static inline void -locks_insert_global_blocked(struct file_lock *waiter) +static void locks_insert_global_blocked(struct file_lock *waiter) { hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); } -static inline void -locks_delete_global_blocked(struct file_lock *waiter) +static void locks_delete_global_blocked(struct file_lock *waiter) { hash_del(&waiter->fl_link); } @@ -581,7 +546,7 @@ static void locks_delete_block(struct file_lock *waiter) * it seems like the reasonable thing to do. * * Must be called with both the i_lock and blocked_lock_lock held. The fl_block - * list itself is protected by the file_lock_list, but by ensuring that the + * list itself is protected by the blocked_lock_lock, but by ensuring that the * i_lock is also held on insertions we can avoid taking the blocked_lock_lock * in some cases when we see that the fl_block list is empty. */ @@ -591,7 +556,7 @@ static void __locks_insert_block(struct file_lock *blocker, BUG_ON(!list_empty(&waiter->fl_block)); waiter->fl_next = blocker; list_add_tail(&waiter->fl_block, &blocker->fl_block); - if (IS_POSIX(blocker)) + if (IS_POSIX(blocker) && !IS_OFDLCK(blocker)) locks_insert_global_blocked(waiter); } @@ -652,15 +617,18 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) locks_insert_global_locks(fl); } -/* - * Delete a lock and then free it. - * Wake up processes that are blocked waiting for this lock, - * notify the FS that the lock has been cleared and - * finally free the lock. +/** + * locks_delete_lock - Delete a lock and then free it. + * @thisfl_p: pointer that points to the fl_next field of the previous + * inode->i_flock list entry + * + * Unlink a lock from all lists and free the namespace reference, but don't + * free it yet. Wake up processes that are blocked waiting for this lock and + * notify the FS that the lock has been cleared. * * Must be called with the i_lock held! */ -static void locks_delete_lock(struct file_lock **thisfl_p) +static void locks_unlink_lock(struct file_lock **thisfl_p) { struct file_lock *fl = *thisfl_p; @@ -675,6 +643,18 @@ static void locks_delete_lock(struct file_lock **thisfl_p) } locks_wake_up_blocks(fl); +} + +/* + * Unlink a lock from all lists and free it. + * + * Must be called with i_lock held! + */ +static void locks_delete_lock(struct file_lock **thisfl_p) +{ + struct file_lock *fl = *thisfl_p; + + locks_unlink_lock(thisfl_p); locks_free_lock(fl); } @@ -769,8 +749,16 @@ EXPORT_SYMBOL(posix_test_lock); * Note: the above assumption may not be true when handling lock * requests from a broken NFS client. It may also fail in the presence * of tasks (such as posix threads) sharing the same open file table. - * * To handle those cases, we just bail out after a few iterations. + * + * For FL_OFDLCK locks, the owner is the filp, not the files_struct. + * Because the owner is not even nominally tied to a thread of + * execution, the deadlock detection below can't reasonably work well. Just + * skip it for those. + * + * In principle, we could do a more limited deadlock detection on FL_OFDLCK + * locks that just checks for the case where two tasks are attempting to + * upgrade from read to write locks on the same inode. */ #define MAX_DEADLK_ITERATIONS 10 @@ -793,6 +781,13 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, { int i = 0; + /* + * This deadlock detector can't reasonably detect deadlocks with + * FL_OFDLCK locks, since they aren't owned by a process, per-se. + */ + if (IS_OFDLCK(caller_fl)) + return 0; + while ((block_fl = what_owner_is_waiting_for(block_fl))) { if (i++ > MAX_DEADLK_ITERATIONS) return 0; @@ -1152,13 +1147,14 @@ EXPORT_SYMBOL(posix_lock_file_wait); /** * locks_mandatory_locked - Check for an active lock - * @inode: the file to check + * @file: the file to check * * Searches the inode's list of locks to find any POSIX locks which conflict. * This function is called from locks_verify_locked() only. */ -int locks_mandatory_locked(struct inode *inode) +int locks_mandatory_locked(struct file *file) { + struct inode *inode = file_inode(file); fl_owner_t owner = current->files; struct file_lock *fl; @@ -1169,7 +1165,7 @@ int locks_mandatory_locked(struct inode *inode) for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!IS_POSIX(fl)) continue; - if (fl->fl_owner != owner) + if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file) break; } spin_unlock(&inode->i_lock); @@ -1195,19 +1191,30 @@ int locks_mandatory_area(int read_write, struct inode *inode, { struct file_lock fl; int error; + bool sleep = false; locks_init_lock(&fl); - fl.fl_owner = current->files; fl.fl_pid = current->tgid; fl.fl_file = filp; fl.fl_flags = FL_POSIX | FL_ACCESS; if (filp && !(filp->f_flags & O_NONBLOCK)) - fl.fl_flags |= FL_SLEEP; + sleep = true; fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; fl.fl_start = offset; fl.fl_end = offset + count - 1; for (;;) { + if (filp) { + fl.fl_owner = (fl_owner_t)filp; + fl.fl_flags &= ~FL_SLEEP; + error = __posix_lock_file(inode, &fl, NULL); + if (!error) + break; + } + + if (sleep) + fl.fl_flags |= FL_SLEEP; + fl.fl_owner = current->files; error = __posix_lock_file(inode, &fl, NULL); if (error != FILE_LOCK_DEFERRED) break; @@ -1283,6 +1290,7 @@ static void time_out_leases(struct inode *inode) before = &inode->i_flock; while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { + trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) lease_modify(before, F_RDLCK); if (past_time(fl->fl_break_time)) @@ -1292,28 +1300,40 @@ static void time_out_leases(struct inode *inode) } } +static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) +{ + if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) + return false; + return locks_conflict(breaker, lease); +} + /** * __break_lease - revoke all outstanding leases on file * @inode: the inode of the file to return - * @mode: the open mode (read or write) + * @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR: + * break all leases + * @type: FL_LEASE: break leases and delegations; FL_DELEG: break + * only delegations * * break_lease (inlined for speed) has checked there already is at least * some kind of lock (maybe a lease) on this file. Leases are broken on * a call to open() or truncate(). This function can sleep unless you * specified %O_NONBLOCK to your open(). */ -int __break_lease(struct inode *inode, unsigned int mode) +int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { int error = 0; struct file_lock *new_fl, *flock; struct file_lock *fl; unsigned long break_time; int i_have_this_lease = 0; + bool lease_conflict = false; int want_write = (mode & O_ACCMODE) != O_RDONLY; new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); if (IS_ERR(new_fl)) return PTR_ERR(new_fl); + new_fl->fl_flags = type; spin_lock(&inode->i_lock); @@ -1323,13 +1343,16 @@ int __break_lease(struct inode *inode, unsigned int mode) if ((flock == NULL) || !IS_LEASE(flock)) goto out; - if (!locks_conflict(flock, new_fl)) + for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { + if (leases_conflict(fl, new_fl)) { + lease_conflict = true; + if (fl->fl_owner == current->files) + i_have_this_lease = 1; + } + } + if (!lease_conflict) goto out; - for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) - if (fl->fl_owner == current->files) - i_have_this_lease = 1; - break_time = 0; if (lease_break_time > 0) { break_time = jiffies + lease_break_time * HZ; @@ -1338,6 +1361,8 @@ int __break_lease(struct inode *inode, unsigned int mode) } for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { + if (!leases_conflict(fl, new_fl)) + continue; if (want_write) { if (fl->fl_flags & FL_UNLOCK_PENDING) continue; @@ -1353,22 +1378,24 @@ int __break_lease(struct inode *inode, unsigned int mode) } if (i_have_this_lease || (mode & O_NONBLOCK)) { + trace_break_lease_noblock(inode, new_fl); error = -EWOULDBLOCK; goto out; } restart: break_time = flock->fl_break_time; - if (break_time != 0) { + if (break_time != 0) break_time -= jiffies; - if (break_time == 0) - break_time++; - } + if (break_time == 0) + break_time++; locks_insert_block(flock, new_fl); + trace_break_lease_block(inode, new_fl); spin_unlock(&inode->i_lock); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); spin_lock(&inode->i_lock); + trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); if (error >= 0) { if (error == 0) @@ -1379,7 +1406,7 @@ restart: */ for (flock = inode->i_flock; flock && IS_LEASE(flock); flock = flock->fl_next) { - if (locks_conflict(new_fl, flock)) + if (leases_conflict(new_fl, flock)) goto restart; } error = 0; @@ -1455,21 +1482,63 @@ int fcntl_getlease(struct file *filp) return type; } +/** + * check_conflicting_open - see if the given dentry points to a file that has + * an existing open that would conflict with the + * desired lease. + * @dentry: dentry to check + * @arg: type of lease that we're trying to acquire + * + * Check to see if there's an existing open fd on this file that would + * conflict with the lease we're trying to set. + */ +static int +check_conflicting_open(const struct dentry *dentry, const long arg) +{ + int ret = 0; + struct inode *inode = dentry->d_inode; + + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + return -EAGAIN; + + if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || + (atomic_read(&inode->i_count) > 1))) + ret = -EAGAIN; + + return ret; +} + static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; lease = *flp; + trace_generic_add_lease(inode, lease); - error = -EAGAIN; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) - goto out; - if ((arg == F_WRLCK) - && ((d_count(dentry) > 1) - || (atomic_read(&inode->i_count) > 1))) + /* + * In the delegation case we need mutual exclusion with + * a number of operations that take the i_mutex. We trylock + * because delegations are an optional optimization, and if + * there's some chance of a conflict--we'd rather not + * bother, maybe that's a sign this just isn't a good file to + * hand out a delegation on. + */ + if (is_deleg && !mutex_trylock(&inode->i_mutex)) + return -EAGAIN; + + if (is_deleg && arg == F_WRLCK) { + /* Write delegations are not currently supported: */ + mutex_unlock(&inode->i_mutex); + WARN_ON_ONCE(1); + return -EINVAL; + } + + error = check_conflicting_open(dentry, arg); + if (error) goto out; /* @@ -1514,9 +1583,22 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp goto out; locks_insert_lock(before, lease); - return 0; - + /* + * The check in break_lease() is lockless. It's possible for another + * open to race in after we did the earlier check for a conflicting + * open but before the lease was inserted. Check again for a + * conflicting open and cancel the lease if there is one. + * + * We also add a barrier here to ensure that the insertion of the lock + * precedes these checks. + */ + smp_mb(); + error = check_conflicting_open(dentry, arg); + if (error) + locks_unlink_lock(flp); out: + if (is_deleg) + mutex_unlock(&inode->i_mutex); return error; } @@ -1526,6 +1608,8 @@ static int generic_delete_lease(struct file *filp, struct file_lock **flp) struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + trace_generic_delete_lease(inode, *flp); + for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { @@ -1579,7 +1663,7 @@ EXPORT_SYMBOL(generic_setlease); static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) { - if (filp->f_op && filp->f_op->setlease) + if (filp->f_op->setlease) return filp->f_op->setlease(filp, arg, lease); else return generic_setlease(filp, arg, lease); @@ -1771,7 +1855,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (error) goto out_free; - if (f.file->f_op && f.file->f_op->flock) + if (f.file->f_op->flock) error = f.file->f_op->flock(f.file, (can_sleep) ? F_SETLKW : F_SETLK, lock); @@ -1797,7 +1881,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op && filp->f_op->lock) + if (filp->f_op->lock) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); return 0; @@ -1806,7 +1890,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -1828,7 +1912,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; @@ -1840,7 +1924,7 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ -int fcntl_getlk(struct file *filp, struct flock __user *l) +int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) { struct file_lock file_lock; struct flock flock; @@ -1857,6 +1941,16 @@ int fcntl_getlk(struct file *filp, struct flock __user *l) if (error) goto out; + if (cmd == F_OFD_GETLK) { + error = -EINVAL; + if (flock.l_pid != 0) + goto out; + + cmd = F_GETLK; + file_lock.fl_flags |= FL_OFDLCK; + file_lock.fl_owner = (fl_owner_t)filp; + } + error = vfs_test_lock(filp, &file_lock); if (error) goto out; @@ -1909,7 +2003,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { - if (filp->f_op && filp->f_op->lock) + if (filp->f_op->lock) return filp->f_op->lock(filp, cmd, fl); else return posix_lock_file(filp, fl, conf); @@ -1940,6 +2034,22 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, return error; } +/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */ +static int +check_fmode_for_setlk(struct file_lock *fl) +{ + switch (fl->fl_type) { + case F_RDLCK: + if (!(fl->fl_file->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(fl->fl_file->f_mode & FMODE_WRITE)) + return -EBADF; + } + return 0; +} + /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ @@ -1976,25 +2086,36 @@ again: error = flock_to_posix_lock(filp, file_lock, &flock); if (error) goto out; - if (cmd == F_SETLKW) { - file_lock->fl_flags |= FL_SLEEP; - } - - error = -EBADF; - switch (flock.l_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - goto out; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) + + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + + /* + * If the cmd is requesting file-private locks, then set the + * FL_OFDLCK flag and override the owner. + */ + switch (cmd) { + case F_OFD_SETLK: + error = -EINVAL; + if (flock.l_pid != 0) goto out; + + cmd = F_SETLK; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = (fl_owner_t)filp; break; - case F_UNLCK: - break; - default: + case F_OFD_SETLKW: error = -EINVAL; - goto out; + if (flock.l_pid != 0) + goto out; + + cmd = F_SETLKW; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = (fl_owner_t)filp; + /* Fallthrough */ + case F_SETLKW: + file_lock->fl_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); @@ -2025,7 +2146,7 @@ out: /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ -int fcntl_getlk64(struct file *filp, struct flock64 __user *l) +int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) { struct file_lock file_lock; struct flock64 flock; @@ -2042,6 +2163,16 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l) if (error) goto out; + if (cmd == F_OFD_GETLK) { + error = -EINVAL; + if (flock.l_pid != 0) + goto out; + + cmd = F_GETLK64; + file_lock.fl_flags |= FL_OFDLCK; + file_lock.fl_owner = (fl_owner_t)filp; + } + error = vfs_test_lock(filp, &file_lock); if (error) goto out; @@ -2094,25 +2225,36 @@ again: error = flock64_to_posix_lock(filp, file_lock, &flock); if (error) goto out; - if (cmd == F_SETLKW64) { - file_lock->fl_flags |= FL_SLEEP; - } - - error = -EBADF; - switch (flock.l_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - goto out; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) + + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + + /* + * If the cmd is requesting file-private locks, then set the + * FL_OFDLCK flag and override the owner. + */ + switch (cmd) { + case F_OFD_SETLK: + error = -EINVAL; + if (flock.l_pid != 0) goto out; + + cmd = F_SETLK64; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = (fl_owner_t)filp; break; - case F_UNLCK: - break; - default: + case F_OFD_SETLKW: error = -EINVAL; - goto out; + if (flock.l_pid != 0) + goto out; + + cmd = F_SETLKW64; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = (fl_owner_t)filp; + /* Fallthrough */ + case F_SETLKW64: + file_lock->fl_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); @@ -2173,7 +2315,7 @@ EXPORT_SYMBOL(locks_remove_posix); /* * This function is called on the last close of an open file. */ -void locks_remove_flock(struct file *filp) +void locks_remove_file(struct file *filp) { struct inode * inode = file_inode(filp); struct file_lock *fl; @@ -2182,8 +2324,11 @@ void locks_remove_flock(struct file *filp) if (!inode->i_flock) return; - if (filp->f_op && filp->f_op->flock) { + locks_remove_posix(filp, (fl_owner_t)filp); + + if (filp->f_op->flock) { struct file_lock fl = { + .fl_owner = (fl_owner_t)filp, .fl_pid = current->tgid, .fl_file = filp, .fl_flags = FL_FLOCK, @@ -2200,16 +2345,28 @@ void locks_remove_flock(struct file *filp) while ((fl = *before) != NULL) { if (fl->fl_file == filp) { - if (IS_FLOCK(fl)) { - locks_delete_lock(before); - continue; - } if (IS_LEASE(fl)) { lease_modify(before, F_UNLCK); continue; } - /* What? */ - BUG(); + + /* + * There's a leftover lock on the list of a type that + * we didn't expect to see. Most likely a classic + * POSIX lock that ended up not getting released + * properly, or that raced onto the list somehow. Log + * some info about it and then just remove it from + * the list. + */ + WARN(!IS_FLOCK(fl), + "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), inode->i_ino, + fl->fl_type, fl->fl_flags, + fl->fl_start, fl->fl_end); + + locks_delete_lock(before); + continue; } before = &fl->fl_next; } @@ -2246,7 +2403,7 @@ EXPORT_SYMBOL(posix_unblock_lock); */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op && filp->f_op->lock) + if (filp->f_op->lock) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; } @@ -2278,26 +2435,32 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { - seq_printf(f, "%6s %s ", - (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", + if (fl->fl_flags & FL_ACCESS) + seq_puts(f, "ACCESS"); + else if (IS_OFDLCK(fl)) + seq_puts(f, "OFDLCK"); + else + seq_puts(f, "POSIX "); + + seq_printf(f, " %s ", (inode == NULL) ? "*NOINODE*" : mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); } else if (IS_FLOCK(fl)) { if (fl->fl_type & LOCK_MAND) { - seq_printf(f, "FLOCK MSNFS "); + seq_puts(f, "FLOCK MSNFS "); } else { - seq_printf(f, "FLOCK ADVISORY "); + seq_puts(f, "FLOCK ADVISORY "); } } else if (IS_LEASE(fl)) { - seq_printf(f, "LEASE "); + seq_puts(f, "LEASE "); if (lease_breaking(fl)) - seq_printf(f, "BREAKING "); + seq_puts(f, "BREAKING "); else if (fl->fl_file) - seq_printf(f, "ACTIVE "); + seq_puts(f, "ACTIVE "); else - seq_printf(f, "BREAKER "); + seq_puts(f, "BREAKER "); } else { - seq_printf(f, "UNKNOWN UNKNOWN "); + seq_puts(f, "UNKNOWN UNKNOWN "); } if (fl->fl_type & LOCK_MAND) { seq_printf(f, "%s ", @@ -2329,7 +2492,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, else seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end); } else { - seq_printf(f, "0 EOF\n"); + seq_puts(f, "0 EOF\n"); } } @@ -2349,6 +2512,7 @@ static int locks_show(struct seq_file *f, void *v) } static void *locks_start(struct seq_file *f, loff_t *pos) + __acquires(&blocked_lock_lock) { struct locks_iterator *iter = f->private; @@ -2367,6 +2531,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos) } static void locks_stop(struct seq_file *f, void *v) + __releases(&blocked_lock_lock) { spin_unlock(&blocked_lock_lock); lg_global_unlock(&file_lock_lglock); diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 550475ca6a0..76279e11982 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -14,16 +14,10 @@ #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) -static void request_complete(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} - static int sync_request(struct page *page, struct block_device *bdev, int rw) { struct bio bio; struct bio_vec bio_vec; - struct completion complete; bio_init(&bio); bio.bi_max_vecs = 1; @@ -32,16 +26,11 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio_vec.bv_len = PAGE_SIZE; bio_vec.bv_offset = 0; bio.bi_vcnt = 1; - bio.bi_size = PAGE_SIZE; bio.bi_bdev = bdev; - bio.bi_sector = page->index * (PAGE_SIZE >> 9); - init_completion(&complete); - bio.bi_private = &complete; - bio.bi_end_io = request_complete; - - submit_bio(rw, &bio); - wait_for_completion(&complete); - return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; + bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9); + bio.bi_iter.bi_size = PAGE_SIZE; + + return submit_bio_wait(rw, &bio); } static int bdev_readpage(void *_sb, struct page *page) @@ -67,22 +56,18 @@ static DECLARE_WAIT_QUEUE_HEAD(wq); static void writeseg_end_io(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; + int i; struct super_block *sb = bio->bi_private; struct logfs_super *super = logfs_super(sb); - struct page *page; BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ BUG_ON(err); - BUG_ON(bio->bi_vcnt == 0); - do { - page = bvec->bv_page; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - end_page_writeback(page); - page_cache_release(page); - } while (bvec >= bio->bi_io_vec); + + bio_for_each_segment_all(bvec, bio, i) { + end_page_writeback(bvec->bv_page); + page_cache_release(bvec->bv_page); + } bio_put(bio); if (atomic_dec_and_test(&super->s_pending_writes)) wake_up(&wq); @@ -107,9 +92,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_size = i * PAGE_SIZE; + bio->bi_iter.bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; atomic_inc(&super->s_pending_writes); @@ -134,9 +119,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unlock_page(page); } bio->bi_vcnt = nr_pages; - bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; atomic_inc(&super->s_pending_writes); @@ -199,9 +184,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_size = i * PAGE_SIZE; + bio->bi_iter.bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; atomic_inc(&super->s_pending_writes); @@ -220,9 +205,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_io_vec[i].bv_offset = 0; } bio->bi_vcnt = nr_pages; - bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; atomic_inc(&super->s_pending_writes); diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 57914fc32b6..8538752df2f 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -264,15 +264,15 @@ const struct inode_operations logfs_reg_iops = { }; const struct file_operations logfs_reg_fops = { - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .fsync = logfs_fsync, .unlocked_ioctl = logfs_ioctl, .llseek = generic_file_llseek, .mmap = generic_file_readonly_mmap, .open = generic_file_open, - .read = do_sync_read, - .write = do_sync_write, + .read = new_sync_read, + .write = new_sync_write, }; const struct address_space_operations logfs_reg_aops = { diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 9a59cbade2f..48140315f62 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -2180,7 +2180,7 @@ void logfs_evict_inode(struct inode *inode) do_delete_inode(inode); } } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Cheaper version of write_inode. All changes are concealed in diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index d448a777166..7f9b096d8d5 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -62,7 +62,8 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, page = read_cache_page(mapping, index, filler, sb); else { page = find_or_create_page(mapping, index, GFP_NOFS); - unlock_page(page); + if (page) + unlock_page(page); } return page; } diff --git a/fs/mbcache.c b/fs/mbcache.c index e519e45bf67..187477ded6b 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -26,6 +26,41 @@ * back on the lru list. */ +/* + * Lock descriptions and usage: + * + * Each hash chain of both the block and index hash tables now contains + * a built-in lock used to serialize accesses to the hash chain. + * + * Accesses to global data structures mb_cache_list and mb_cache_lru_list + * are serialized via the global spinlock mb_cache_spinlock. + * + * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize + * accesses to its local data, such as e_used and e_queued. + * + * Lock ordering: + * + * Each block hash chain's lock has the highest lock order, followed by an + * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's + * lock), and mb_cach_spinlock, with the lowest order. While holding + * either a block or index hash chain lock, a thread can acquire an + * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. + * + * Synchronization: + * + * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and + * index hash chian, it needs to lock the corresponding hash chain. For each + * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to + * prevent either any simultaneous release or free on the entry and also + * to serialize accesses to either the e_used or e_queued member of the entry. + * + * To avoid having a dangling reference to an already freed + * mb_cache_entry, an mb_cache_entry is only freed when it is not on a + * block hash chain and also no longer being referenced, both e_used, + * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is + * first removed from a block hash chain. + */ + #include <linux/kernel.h> #include <linux/module.h> @@ -34,9 +69,11 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched.h> -#include <linux/init.h> +#include <linux/list_bl.h> #include <linux/mbcache.h> - +#include <linux/init.h> +#include <linux/blockgroup_lock.h> +#include <linux/log2.h> #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ @@ -57,8 +94,14 @@ #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) +#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) +#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ + (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) + static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); - +static struct blockgroup_lock *mb_cache_bg_lock; +static struct kmem_cache *mb_cache_kmem_cache; + MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); MODULE_LICENSE("GPL"); @@ -86,58 +129,110 @@ static LIST_HEAD(mb_cache_list); static LIST_HEAD(mb_cache_lru_list); static DEFINE_SPINLOCK(mb_cache_spinlock); +static inline void +__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) +{ + spin_lock(bgl_lock_ptr(mb_cache_bg_lock, + MB_CACHE_ENTRY_LOCK_INDEX(ce))); +} + +static inline void +__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) +{ + spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, + MB_CACHE_ENTRY_LOCK_INDEX(ce))); +} + static inline int -__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) +__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) { - return !list_empty(&ce->e_block_list); + return !hlist_bl_unhashed(&ce->e_block_list); } -static void -__mb_cache_entry_unhash(struct mb_cache_entry *ce) +static inline void +__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) { - if (__mb_cache_entry_is_hashed(ce)) { - list_del_init(&ce->e_block_list); - list_del(&ce->e_index.o_list); - } + if (__mb_cache_entry_is_block_hashed(ce)) + hlist_bl_del_init(&ce->e_block_list); } +static inline int +__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) +{ + return !hlist_bl_unhashed(&ce->e_index.o_list); +} + +static inline void +__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) +{ + if (__mb_cache_entry_is_index_hashed(ce)) + hlist_bl_del_init(&ce->e_index.o_list); +} + +/* + * __mb_cache_entry_unhash_unlock() + * + * This function is called to unhash both the block and index hash + * chain. + * It assumes both the block and index hash chain is locked upon entry. + * It also unlock both hash chains both exit + */ +static inline void +__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) +{ + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); +} static void __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) { struct mb_cache *cache = ce->e_cache; - mb_assert(!(ce->e_used || ce->e_queued)); + mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); kmem_cache_free(cache->c_entry_cache, ce); atomic_dec(&cache->c_entry_count); } - static void -__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) - __releases(mb_cache_spinlock) +__mb_cache_entry_release(struct mb_cache_entry *ce) { + /* First lock the entry to serialize access to its local data. */ + __spin_lock_mb_cache_entry(ce); /* Wake up all processes queuing for this cache entry. */ if (ce->e_queued) wake_up_all(&mb_cache_queue); if (ce->e_used >= MB_CACHE_WRITER) ce->e_used -= MB_CACHE_WRITER; + /* + * Make sure that all cache entries on lru_list have + * both e_used and e_qued of 0s. + */ ce->e_used--; - if (!(ce->e_used || ce->e_queued)) { - if (!__mb_cache_entry_is_hashed(ce)) + if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { + if (!__mb_cache_entry_is_block_hashed(ce)) { + __spin_unlock_mb_cache_entry(ce); goto forget; - mb_assert(list_empty(&ce->e_lru_list)); - list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); + } + /* + * Need access to lru list, first drop entry lock, + * then reacquire the lock in the proper order. + */ + spin_lock(&mb_cache_spinlock); + if (list_empty(&ce->e_lru_list)) + list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); + spin_unlock(&mb_cache_spinlock); } - spin_unlock(&mb_cache_spinlock); + __spin_unlock_mb_cache_entry(ce); return; forget: - spin_unlock(&mb_cache_spinlock); + mb_assert(list_empty(&ce->e_lru_list)); __mb_cache_entry_forget(ce, GFP_KERNEL); } - /* * mb_cache_shrink_scan() memory pressure callback * @@ -160,17 +255,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) mb_debug("trying to free %d entries", nr_to_scan); spin_lock(&mb_cache_spinlock); - while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { + while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { struct mb_cache_entry *ce = list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); - freed++; + struct mb_cache_entry, e_lru_list); + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* Prevent any find or get operation on the entry */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + spin_lock(&mb_cache_spinlock); + continue; + } + __mb_cache_entry_unhash_unlock(ce); + list_add_tail(&ce->e_lru_list, &free_list); + spin_lock(&mb_cache_spinlock); } spin_unlock(&mb_cache_spinlock); + list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { __mb_cache_entry_forget(entry, gfp_mask); + freed++; } return freed; } @@ -215,29 +327,40 @@ mb_cache_create(const char *name, int bucket_bits) int n, bucket_count = 1 << bucket_bits; struct mb_cache *cache = NULL; + if (!mb_cache_bg_lock) { + mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), + GFP_KERNEL); + if (!mb_cache_bg_lock) + return NULL; + bgl_lock_init(mb_cache_bg_lock); + } + cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) return NULL; cache->c_name = name; atomic_set(&cache->c_entry_count, 0); cache->c_bucket_bits = bucket_bits; - cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), - GFP_KERNEL); + cache->c_block_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_block_hash) goto fail; for (n=0; n<bucket_count; n++) - INIT_LIST_HEAD(&cache->c_block_hash[n]); - cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), - GFP_KERNEL); + INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]); + cache->c_index_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_index_hash) goto fail; for (n=0; n<bucket_count; n++) - INIT_LIST_HEAD(&cache->c_index_hash[n]); - cache->c_entry_cache = kmem_cache_create(name, - sizeof(struct mb_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - if (!cache->c_entry_cache) - goto fail2; + INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]); + if (!mb_cache_kmem_cache) { + mb_cache_kmem_cache = kmem_cache_create(name, + sizeof(struct mb_cache_entry), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!mb_cache_kmem_cache) + goto fail2; + } + cache->c_entry_cache = mb_cache_kmem_cache; /* * Set an upper limit on the number of cache entries so that the hash @@ -273,21 +396,47 @@ void mb_cache_shrink(struct block_device *bdev) { LIST_HEAD(free_list); - struct list_head *l, *ltmp; + struct list_head *l; + struct mb_cache_entry *ce, *tmp; + l = &mb_cache_lru_list; spin_lock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &mb_cache_lru_list) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_lru_list); + while (!list_is_last(l, &mb_cache_lru_list)) { + l = l->next; + ce = list_entry(l, struct mb_cache_entry, e_lru_list); if (ce->e_bdev == bdev) { - list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* + * Prevent any find or get operation on the entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + continue; + } + __mb_cache_entry_unhash_unlock(ce); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + list_add_tail(&ce->e_lru_list, &free_list); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); } } spin_unlock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &free_list) { - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, - e_lru_list), GFP_KERNEL); + + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { + __mb_cache_entry_forget(ce, GFP_KERNEL); } } @@ -303,23 +452,27 @@ void mb_cache_destroy(struct mb_cache *cache) { LIST_HEAD(free_list); - struct list_head *l, *ltmp; + struct mb_cache_entry *ce, *tmp; spin_lock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &mb_cache_lru_list) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_lru_list); - if (ce->e_cache == cache) { + list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { + if (ce->e_cache == cache) list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); - } } list_del(&cache->c_cache_list); spin_unlock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &free_list) { - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, - e_lru_list), GFP_KERNEL); + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { + list_del_init(&ce->e_lru_list); + /* + * Prevent any find or get operation on the entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + __mb_cache_entry_unhash_unlock(ce); + __mb_cache_entry_forget(ce, GFP_KERNEL); } if (atomic_read(&cache->c_entry_count) > 0) { @@ -328,8 +481,10 @@ mb_cache_destroy(struct mb_cache *cache) atomic_read(&cache->c_entry_count)); } - kmem_cache_destroy(cache->c_entry_cache); - + if (list_empty(&mb_cache_list)) { + kmem_cache_destroy(mb_cache_kmem_cache); + mb_cache_kmem_cache = NULL; + } kfree(cache->c_index_hash); kfree(cache->c_block_hash); kfree(cache); @@ -346,28 +501,61 @@ mb_cache_destroy(struct mb_cache *cache) struct mb_cache_entry * mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) { - struct mb_cache_entry *ce = NULL; + struct mb_cache_entry *ce; if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { + struct list_head *l; + + l = &mb_cache_lru_list; spin_lock(&mb_cache_spinlock); - if (!list_empty(&mb_cache_lru_list)) { - ce = list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_del_init(&ce->e_lru_list); - __mb_cache_entry_unhash(ce); + while (!list_is_last(l, &mb_cache_lru_list)) { + l = l->next; + ce = list_entry(l, struct mb_cache_entry, e_lru_list); + if (ce->e_cache == cache) { + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* + * Prevent any find or get operation on the + * entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + continue; + } + mb_assert(list_empty(&ce->e_lru_list)); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + __mb_cache_entry_unhash_unlock(ce); + goto found; + } } spin_unlock(&mb_cache_spinlock); } - if (!ce) { - ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); - if (!ce) - return NULL; - atomic_inc(&cache->c_entry_count); - INIT_LIST_HEAD(&ce->e_lru_list); - INIT_LIST_HEAD(&ce->e_block_list); - ce->e_cache = cache; - ce->e_queued = 0; - } + + ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); + if (!ce) + return NULL; + atomic_inc(&cache->c_entry_count); + INIT_LIST_HEAD(&ce->e_lru_list); + INIT_HLIST_BL_NODE(&ce->e_block_list); + INIT_HLIST_BL_NODE(&ce->e_index.o_list); + ce->e_cache = cache; + ce->e_queued = 0; + atomic_set(&ce->e_refcnt, 0); +found: + ce->e_block_hash_p = &cache->c_block_hash[0]; + ce->e_index_hash_p = &cache->c_index_hash[0]; ce->e_used = 1 + MB_CACHE_WRITER; return ce; } @@ -393,29 +581,38 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, { struct mb_cache *cache = ce->e_cache; unsigned int bucket; - struct list_head *l; - int error = -EBUSY; + struct hlist_bl_node *l; + struct hlist_bl_head *block_hash_p; + struct hlist_bl_head *index_hash_p; + struct mb_cache_entry *lce; + mb_assert(ce); bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); - spin_lock(&mb_cache_spinlock); - list_for_each_prev(l, &cache->c_block_hash[bucket]) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_block_list); - if (ce->e_bdev == bdev && ce->e_block == block) - goto out; + block_hash_p = &cache->c_block_hash[bucket]; + hlist_bl_lock(block_hash_p); + hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { + if (lce->e_bdev == bdev && lce->e_block == block) { + hlist_bl_unlock(block_hash_p); + return -EBUSY; + } } - __mb_cache_entry_unhash(ce); + mb_assert(!__mb_cache_entry_is_block_hashed(ce)); + __mb_cache_entry_unhash_block(ce); + __mb_cache_entry_unhash_index(ce); ce->e_bdev = bdev; ce->e_block = block; - list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); + ce->e_block_hash_p = block_hash_p; ce->e_index.o_key = key; + hlist_bl_add_head(&ce->e_block_list, block_hash_p); + hlist_bl_unlock(block_hash_p); bucket = hash_long(key, cache->c_bucket_bits); - list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); - error = 0; -out: - spin_unlock(&mb_cache_spinlock); - return error; + index_hash_p = &cache->c_index_hash[bucket]; + hlist_bl_lock(index_hash_p); + ce->e_index_hash_p = index_hash_p; + hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); + hlist_bl_unlock(index_hash_p); + return 0; } @@ -429,24 +626,26 @@ out: void mb_cache_entry_release(struct mb_cache_entry *ce) { - spin_lock(&mb_cache_spinlock); - __mb_cache_entry_release_unlock(ce); + __mb_cache_entry_release(ce); } /* * mb_cache_entry_free() * - * This is equivalent to the sequence mb_cache_entry_takeout() -- - * mb_cache_entry_release(). */ void mb_cache_entry_free(struct mb_cache_entry *ce) { - spin_lock(&mb_cache_spinlock); + mb_assert(ce); mb_assert(list_empty(&ce->e_lru_list)); - __mb_cache_entry_unhash(ce); - __mb_cache_entry_release_unlock(ce); + hlist_bl_lock(ce->e_index_hash_p); + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_lock(ce->e_block_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); + __mb_cache_entry_release(ce); } @@ -463,84 +662,110 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, sector_t block) { unsigned int bucket; - struct list_head *l; + struct hlist_bl_node *l; struct mb_cache_entry *ce; + struct hlist_bl_head *block_hash_p; bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); - spin_lock(&mb_cache_spinlock); - list_for_each(l, &cache->c_block_hash[bucket]) { - ce = list_entry(l, struct mb_cache_entry, e_block_list); + block_hash_p = &cache->c_block_hash[bucket]; + /* First serialize access to the block corresponding hash chain. */ + hlist_bl_lock(block_hash_p); + hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { + mb_assert(ce->e_block_hash_p == block_hash_p); if (ce->e_bdev == bdev && ce->e_block == block) { - DEFINE_WAIT(wait); + /* + * Prevent a free from removing the entry. + */ + atomic_inc(&ce->e_refcnt); + hlist_bl_unlock(block_hash_p); + __spin_lock_mb_cache_entry(ce); + atomic_dec(&ce->e_refcnt); + if (ce->e_used > 0) { + DEFINE_WAIT(wait); + while (ce->e_used > 0) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + __spin_unlock_mb_cache_entry(ce); + schedule(); + __spin_lock_mb_cache_entry(ce); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); + } + ce->e_used += 1 + MB_CACHE_WRITER; + __spin_unlock_mb_cache_entry(ce); - if (!list_empty(&ce->e_lru_list)) + if (!list_empty(&ce->e_lru_list)) { + spin_lock(&mb_cache_spinlock); list_del_init(&ce->e_lru_list); - - while (ce->e_used > 0) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); spin_unlock(&mb_cache_spinlock); - schedule(); - spin_lock(&mb_cache_spinlock); - ce->e_queued--; } - finish_wait(&mb_cache_queue, &wait); - ce->e_used += 1 + MB_CACHE_WRITER; - - if (!__mb_cache_entry_is_hashed(ce)) { - __mb_cache_entry_release_unlock(ce); + if (!__mb_cache_entry_is_block_hashed(ce)) { + __mb_cache_entry_release(ce); return NULL; } - goto cleanup; + return ce; } } - ce = NULL; - -cleanup: - spin_unlock(&mb_cache_spinlock); - return ce; + hlist_bl_unlock(block_hash_p); + return NULL; } #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) static struct mb_cache_entry * -__mb_cache_entry_find(struct list_head *l, struct list_head *head, +__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, struct block_device *bdev, unsigned int key) { - while (l != head) { + + /* The index hash chain is alredy acquire by caller. */ + while (l != NULL) { struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_index.o_list); + hlist_bl_entry(l, struct mb_cache_entry, + e_index.o_list); + mb_assert(ce->e_index_hash_p == head); if (ce->e_bdev == bdev && ce->e_index.o_key == key) { - DEFINE_WAIT(wait); - - if (!list_empty(&ce->e_lru_list)) - list_del_init(&ce->e_lru_list); - + /* + * Prevent a free from removing the entry. + */ + atomic_inc(&ce->e_refcnt); + hlist_bl_unlock(head); + __spin_lock_mb_cache_entry(ce); + atomic_dec(&ce->e_refcnt); + ce->e_used++; /* Incrementing before holding the lock gives readers priority over writers. */ - ce->e_used++; - while (ce->e_used >= MB_CACHE_WRITER) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&mb_cache_spinlock); - schedule(); - spin_lock(&mb_cache_spinlock); - ce->e_queued--; + if (ce->e_used >= MB_CACHE_WRITER) { + DEFINE_WAIT(wait); + + while (ce->e_used >= MB_CACHE_WRITER) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + __spin_unlock_mb_cache_entry(ce); + schedule(); + __spin_lock_mb_cache_entry(ce); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); } - finish_wait(&mb_cache_queue, &wait); - - if (!__mb_cache_entry_is_hashed(ce)) { - __mb_cache_entry_release_unlock(ce); + __spin_unlock_mb_cache_entry(ce); + if (!list_empty(&ce->e_lru_list)) { spin_lock(&mb_cache_spinlock); + list_del_init(&ce->e_lru_list); + spin_unlock(&mb_cache_spinlock); + } + if (!__mb_cache_entry_is_block_hashed(ce)) { + __mb_cache_entry_release(ce); return ERR_PTR(-EAGAIN); } return ce; } l = l->next; } + hlist_bl_unlock(head); return NULL; } @@ -562,13 +787,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, unsigned int key) { unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct list_head *l; - struct mb_cache_entry *ce; - - spin_lock(&mb_cache_spinlock); - l = cache->c_index_hash[bucket].next; - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); - spin_unlock(&mb_cache_spinlock); + struct hlist_bl_node *l; + struct mb_cache_entry *ce = NULL; + struct hlist_bl_head *index_hash_p; + + index_hash_p = &cache->c_index_hash[bucket]; + hlist_bl_lock(index_hash_p); + if (!hlist_bl_empty(index_hash_p)) { + l = hlist_bl_first(index_hash_p); + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); + } else + hlist_bl_unlock(index_hash_p); return ce; } @@ -597,13 +826,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, { struct mb_cache *cache = prev->e_cache; unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct list_head *l; + struct hlist_bl_node *l; struct mb_cache_entry *ce; + struct hlist_bl_head *index_hash_p; - spin_lock(&mb_cache_spinlock); + index_hash_p = &cache->c_index_hash[bucket]; + mb_assert(prev->e_index_hash_p == index_hash_p); + hlist_bl_lock(index_hash_p); + mb_assert(!hlist_bl_empty(index_hash_p)); l = prev->e_index.o_list.next; - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); - __mb_cache_entry_release_unlock(prev); + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); + __mb_cache_entry_release(prev); return ce; } diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index 6624684dd5d..f2a0cfcef11 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -18,7 +18,7 @@ config MINIX_FS config MINIX_FS_NATIVE_ENDIAN def_bool MINIX_FS - depends on H8300 || M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) + depends on M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED def_bool MINIX_FS diff --git a/fs/minix/file.c b/fs/minix/file.c index adc6f549423..a967de085ac 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -14,10 +14,10 @@ */ const struct file_operations minix_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 0332109162a..f007a335557 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -26,7 +26,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data); static void minix_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; minix_truncate(inode); @@ -86,7 +86,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), @@ -123,6 +123,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data) struct minix_sb_info * sbi = minix_sb(sb); struct minix_super_block * ms; + sync_filesystem(sb); ms = sbi->s_ms; if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; diff --git a/fs/mount.h b/fs/mount.h index 64a858143ff..d55297f2fa0 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -10,7 +10,7 @@ struct mnt_namespace { struct user_namespace *user_ns; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; - int event; + u64 event; }; struct mnt_pcp { @@ -19,16 +19,17 @@ struct mnt_pcp { }; struct mountpoint { - struct list_head m_hash; + struct hlist_node m_hash; struct dentry *m_dentry; int m_count; }; struct mount { - struct list_head mnt_hash; + struct hlist_node mnt_hash; struct mount *mnt_parent; struct dentry *mnt_mountpoint; struct vfsmount mnt; + struct rcu_head mnt_rcu; #ifdef CONFIG_SMP struct mnt_pcp __percpu *mnt_pcp; #else @@ -55,7 +56,7 @@ struct mount { int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; - int mnt_ghosts; + struct path mnt_ex_mountpoint; }; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ @@ -73,21 +74,39 @@ static inline int mnt_has_parent(struct mount *mnt) static inline int is_mounted(struct vfsmount *mnt) { /* neither detached nor internal? */ - return !IS_ERR_OR_NULL(real_mount(mnt)); + return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns); } -extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *, int); +extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); +extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); + +extern bool legitimize_mnt(struct vfsmount *, unsigned); static inline void get_mnt_ns(struct mnt_namespace *ns) { atomic_inc(&ns->count); } +extern seqlock_t mount_lock; + +static inline void lock_mount_hash(void) +{ + write_seqlock(&mount_lock); +} + +static inline void unlock_mount_hash(void) +{ + write_sequnlock(&mount_lock); +} + struct proc_mounts { struct seq_file m; struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); + void *cached_mount; + u64 cached_event; + loff_t cached_index; }; #define proc_mounts(p) (container_of((p), struct proc_mounts, m)) diff --git a/fs/mpage.c b/fs/mpage.c index 0face1c4d4c..5f9ed622274 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -43,31 +43,14 @@ */ static void mpage_end_io(struct bio *bio, int err) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + page_endio(page, bio_data_dir(bio), err); + } - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (bio_data_dir(bio) == READ) { - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } else { /* bio_data_dir(bio) == WRITE */ - if (!uptodate) { - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - } - end_page_writeback(page); - } - } while (bvec >= bio->bi_io_vec); bio_put(bio); } @@ -94,7 +77,7 @@ mpage_alloc(struct block_device *bdev, if (bio) { bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; } return bio; } @@ -286,6 +269,11 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, alloc_new: if (bio == NULL) { + if (first_hole == blocks_per_page) { + if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), + page)) + goto out; + } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), min_t(int, nr_pages, bio_get_nr_vecs(bdev)), GFP_KERNEL); @@ -440,6 +428,35 @@ struct mpage_data { unsigned use_writepage; }; +/* + * We have our BIO, so we can now mark the buffers clean. Make + * sure to only clean buffers which we know we'll be writing. + */ +static void clean_buffers(struct page *page, unsigned first_unmapped) +{ + unsigned buffer_counter = 0; + struct buffer_head *bh, *head; + if (!page_has_buffers(page)) + return; + head = page_buffers(page); + bh = head; + + do { + if (buffer_counter++ == first_unmapped) + break; + clear_buffer_dirty(bh); + bh = bh->b_this_page; + } while (bh != head); + + /* + * we cannot drop the bh if the page is not uptodate or a concurrent + * readpage would fail to serialize with the bh and it would read from + * disk before we reach the platter. + */ + if (buffer_heads_over_limit && PageUptodate(page)) + try_to_free_buffers(page); +} + static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -575,6 +592,13 @@ page_is_mapped: alloc_new: if (bio == NULL) { + if (first_unmapped == blocks_per_page) { + if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), + page, wbc)) { + clean_buffers(page, first_unmapped); + goto out; + } + } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); if (bio == NULL) @@ -592,30 +616,7 @@ alloc_new: goto alloc_new; } - /* - * OK, we have our BIO, so we can now mark the buffers clean. Make - * sure to only clean buffers which we know we'll be writing. - */ - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); - struct buffer_head *bh = head; - unsigned buffer_counter = 0; - - do { - if (buffer_counter++ == first_unmapped) - break; - clear_buffer_dirty(bh); - bh = bh->b_this_page; - } while (bh != head); - - /* - * we cannot drop the bh if the page is not uptodate - * or a concurrent readpage would fail to serialize with the bh - * and it would read from disk before we reach the platter. - */ - if (buffer_heads_over_limit && PageUptodate(page)) - try_to_free_buffers(page); - } + clean_buffers(page, first_unmapped); BUG_ON(PageWriteback(page)); set_page_writeback(page); diff --git a/fs/namei.c b/fs/namei.c index 645268f23eb..9eb787e5c16 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -196,6 +196,7 @@ recopy: goto error; result->uptr = filename; + result->aname = NULL; audit_getname(result); return result; @@ -209,7 +210,35 @@ getname(const char __user * filename) { return getname_flags(filename, 0, NULL); } -EXPORT_SYMBOL(getname); + +/* + * The "getname_kernel()" interface doesn't do pathnames longer + * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user. + */ +struct filename * +getname_kernel(const char * filename) +{ + struct filename *result; + char *kname; + int len; + + len = strlen(filename); + if (len >= EMBEDDED_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + result = __getname(); + if (unlikely(!result)) + return ERR_PTR(-ENOMEM); + + kname = (char *)result + sizeof(*result); + result->name = kname; + result->uptr = NULL; + result->aname = NULL; + result->separate = false; + + strlcpy(kname, filename, EMBEDDED_NAME_MAX); + return result; +} #ifdef CONFIG_AUDITSYSCALL void putname(struct filename *name) @@ -235,27 +264,9 @@ static int check_acl(struct inode *inode, int mask) return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); } - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - - /* - * A filesystem can force a ACL callback by just never filling the - * ACL cache. But normally you'd fill the cache either at inode - * instantiation time, or on the first ->get_acl call. - * - * If the filesystem doesn't have a get_acl() function at all, we'll - * just create the negative cache entry. - */ - if (acl == ACL_NOT_CACHED) { - if (inode->i_op->get_acl) { - acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } else { - set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); - return -EAGAIN; - } - } - + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); @@ -321,10 +332,11 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; if (!(mask & MAY_WRITE)) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, + CAP_DAC_READ_SEARCH)) return 0; return -EACCES; } @@ -334,7 +346,7 @@ int generic_permission(struct inode *inode, int mask) * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; /* @@ -342,11 +354,12 @@ int generic_permission(struct inode *inode, int mask) */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; } +EXPORT_SYMBOL(generic_permission); /* * We _really_ want to just do "generic_permission()" without @@ -444,6 +457,7 @@ int inode_permission(struct inode *inode, int mask) return retval; return __inode_permission(inode, mask); } +EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path @@ -482,18 +496,6 @@ EXPORT_SYMBOL(path_put); * to restart the path walk from the beginning in ref-walk mode. */ -static inline void lock_rcu_walk(void) -{ - br_read_lock(&vfsmount_lock); - rcu_read_lock(); -} - -static inline void unlock_rcu_walk(void) -{ - rcu_read_unlock(); - br_read_unlock(&vfsmount_lock); -} - /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data @@ -512,26 +514,22 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) BUG_ON(!(nd->flags & LOOKUP_RCU)); /* - * Get a reference to the parent first: we're - * going to make "path_put(nd->path)" valid in - * non-RCU context for "terminate_walk()". - * - * If this doesn't work, return immediately with - * RCU walking still active (and then we will do - * the RCU walk cleanup in terminate_walk()). + * After legitimizing the bastards, terminate_walk() + * will do the right thing for non-RCU mode, and all our + * subsequent exit cases should rcu_read_unlock() + * before returning. Do vfsmount first; if dentry + * can't be legitimized, just set nd->path.dentry to NULL + * and rely on dput(NULL) being a no-op. */ - if (!lockref_get_not_dead(&parent->d_lockref)) + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) return -ECHILD; - - /* - * After the mntget(), we terminate_walk() will do - * the right thing for non-RCU mode, and all our - * subsequent exit cases should unlock_rcu_walk() - * before returning. - */ - mntget(nd->path.mnt); nd->flags &= ~LOOKUP_RCU; + if (!lockref_get_not_dead(&parent->d_lockref)) { + nd->path.dentry = NULL; + goto out; + } + /* * For a negative lookup, the lookup sequence point is the parents * sequence point, and it only needs to revalidate the parent dentry. @@ -566,17 +564,17 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) spin_unlock(&fs->lock); } - unlock_rcu_walk(); + rcu_read_unlock(); return 0; unlock_and_drop_dentry: spin_unlock(&fs->lock); drop_dentry: - unlock_rcu_walk(); + rcu_read_unlock(); dput(dentry); goto drop_root_mnt; out: - unlock_rcu_walk(); + rcu_read_unlock(); drop_root_mnt: if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; @@ -608,17 +606,22 @@ static int complete_walk(struct nameidata *nd) if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) { + rcu_read_unlock(); + return -ECHILD; + } if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { - unlock_rcu_walk(); + rcu_read_unlock(); + mntput(nd->path.mnt); return -ECHILD; } if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { - unlock_rcu_walk(); + rcu_read_unlock(); dput(dentry); + mntput(nd->path.mnt); return -ECHILD; } - mntget(nd->path.mnt); - unlock_rcu_walk(); + rcu_read_unlock(); } if (likely(!(nd->flags & LOOKUP_JUMPED))) @@ -909,21 +912,22 @@ int follow_up(struct path *path) struct mount *parent; struct dentry *mountpoint; - br_read_lock(&vfsmount_lock); + read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } +EXPORT_SYMBOL(follow_up); /* * Perform an automount @@ -1048,8 +1052,8 @@ static int follow_managed(struct path *path, unsigned flags) /* Something is mounted on this dentry in another * namespace and/or whatever was mounted there in this - * namespace got unmounted before we managed to get the - * vfsmount_lock */ + * namespace got unmounted before lookup_mnt() could + * get it */ } /* Handle an automount point */ @@ -1085,6 +1089,7 @@ int follow_down_one(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down_one); static inline bool managed_dentry_might_block(struct dentry *dentry) { @@ -1109,9 +1114,9 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, return false; if (!d_mountpoint(path->dentry)) - break; + return true; - mounted = __lookup_mnt(path->mnt, path->dentry, 1); + mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) break; path->mnt = &mounted->mnt; @@ -1125,20 +1130,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, */ *inode = path->dentry->d_inode; } - return true; -} - -static void follow_mount_rcu(struct nameidata *nd) -{ - while (d_mountpoint(nd->path.dentry)) { - struct mount *mounted; - mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); - if (!mounted) - break; - nd->path.mnt = &mounted->mnt; - nd->path.dentry = mounted->mnt.mnt_root; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - } + return read_seqretry(&mount_lock, nd->m_seq); } static int follow_dotdot_rcu(struct nameidata *nd) @@ -1166,7 +1158,17 @@ static int follow_dotdot_rcu(struct nameidata *nd) break; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } - follow_mount_rcu(nd); + while (d_mountpoint(nd->path.dentry)) { + struct mount *mounted; + mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); + if (!mounted) + break; + nd->path.mnt = &mounted->mnt; + nd->path.dentry = mounted->mnt.mnt_root; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + if (!read_seqretry(&mount_lock, nd->m_seq)) + goto failed; + } nd->inode = nd->path.dentry->d_inode; return 0; @@ -1174,7 +1176,7 @@ failed: nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - unlock_rcu_walk(); + rcu_read_unlock(); return -ECHILD; } @@ -1226,6 +1228,7 @@ int follow_down(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down); /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() @@ -1308,8 +1311,8 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, } /* - * Call i_op->lookup on the dentry. The dentry must be negative but may be - * hashed if it was pouplated with DCACHE_NEED_LOOKUP. + * Call i_op->lookup on the dentry. The dentry must be negative and + * unhashed. * * dir->d_inode->i_mutex must be held */ @@ -1501,7 +1504,7 @@ static void terminate_walk(struct nameidata *nd) nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - unlock_rcu_walk(); + rcu_read_unlock(); } } @@ -1511,18 +1514,9 @@ static void terminate_walk(struct nameidata *nd) * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static inline int should_follow_link(struct inode *inode, int follow) +static inline int should_follow_link(struct dentry *dentry, int follow) { - if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (likely(inode->i_op->follow_link)) - return follow; - - /* This gets set once for the inode lifetime */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_NOFOLLOW; - spin_unlock(&inode->i_lock); - } - return 0; + return unlikely(d_is_symlink(dentry)) ? follow : 0; } static inline int walk_component(struct nameidata *nd, struct path *path, @@ -1549,10 +1543,10 @@ static inline int walk_component(struct nameidata *nd, struct path *path, inode = path->dentry->d_inode; } err = -ENOENT; - if (!inode) + if (!inode || d_is_negative(path->dentry)) goto out_path_put; - if (should_follow_link(inode, follow)) { + if (should_follow_link(path->dentry, follow)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { err = -ECHILD; @@ -1611,26 +1605,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) } /* - * We really don't want to look at inode->i_op->lookup - * when we don't have to. So we keep a cache bit in - * the inode ->i_opflags field that says "yes, we can - * do lookup on this inode". - */ -static inline int can_lookup(struct inode *inode) -{ - if (likely(inode->i_opflags & IOP_LOOKUP)) - return 1; - if (likely(!inode->i_op->lookup)) - return 0; - - /* We do this once for the lifetime of the inode */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_LOOKUP; - spin_unlock(&inode->i_lock); - return 1; -} - -/* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * @@ -1638,11 +1612,6 @@ static inline int can_lookup(struct inode *inode) * do a "get_unaligned()" if this helps and is sufficiently * fast. * - * - Little-endian machines (so that we can generate the mask - * of low bytes efficiently). Again, we *could* do a byte - * swapping load on big-endian architectures if that is not - * expensive enough to make the optimization worthless. - * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. @@ -1686,7 +1655,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) if (!len) goto done; } - mask = ~(~0ul << len*8); + mask = bytemask_from_count(len); hash += mask & a; done: return fold_hash(hash); @@ -1833,7 +1802,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) return err; } - if (!can_lookup(nd->inode)) { + if (!d_can_lookup(nd->path.dentry)) { err = -ENOTDIR; break; } @@ -1851,9 +1820,10 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; if (flags & LOOKUP_ROOT) { - struct inode *inode = nd->root.dentry->d_inode; + struct dentry *root = nd->root.dentry; + struct inode *inode = root->d_inode; if (*name) { - if (!can_lookup(inode)) + if (!d_can_lookup(root)) return -ENOTDIR; retval = inode_permission(inode, MAY_EXEC); if (retval) @@ -1862,8 +1832,9 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - lock_rcu_walk(); + rcu_read_lock(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + nd->m_seq = read_seqbegin(&mount_lock); } else { path_get(&nd->path); } @@ -1872,9 +1843,10 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->root.mnt = NULL; + nd->m_seq = read_seqbegin(&mount_lock); if (*name=='/') { if (flags & LOOKUP_RCU) { - lock_rcu_walk(); + rcu_read_lock(); set_root_rcu(nd); } else { set_root(nd); @@ -1886,7 +1858,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct fs_struct *fs = current->fs; unsigned seq; - lock_rcu_walk(); + rcu_read_lock(); do { seq = read_seqcount_begin(&fs->seq); @@ -1907,7 +1879,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, dentry = f.file->f_path.dentry; if (*name) { - if (!can_lookup(dentry->d_inode)) { + if (!d_can_lookup(dentry)) { fdput(f); return -ENOTDIR; } @@ -1915,10 +1887,10 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { - if (f.need_put) + if (f.flags & FDPUT_FPUT) *fp = f.file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - lock_rcu_walk(); + rcu_read_lock(); } else { path_get(&nd->path); fdput(f); @@ -1989,7 +1961,7 @@ static int path_lookupat(int dfd, const char *name, err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!can_lookup(nd->inode)) { + if (!d_can_lookup(nd->path.dentry)) { path_put(&nd->path); err = -ENOTDIR; } @@ -2059,6 +2031,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path) *path = nd.path; return res; } +EXPORT_SYMBOL(kern_path); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair @@ -2083,6 +2056,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, *path = nd.path; return err; } +EXPORT_SYMBOL(vfs_path_lookup); /* * Restricted form of lookup. Doesn't follow links, single-component only, @@ -2145,6 +2119,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return __lookup_hash(&this, base, 0); } +EXPORT_SYMBOL(lookup_one_len); int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) @@ -2169,6 +2144,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, { return user_path_at_empty(dfd, name, flags, path, NULL); } +EXPORT_SYMBOL(user_path_at); /* * NB: most callers don't do anything directly with the reference to the @@ -2274,15 +2250,16 @@ mountpoint_last(struct nameidata *nd, struct path *path) mutex_unlock(&dir->d_inode->i_mutex); done: - if (!dentry->d_inode) { + if (!dentry->d_inode || d_is_negative(dentry)) { error = -ENOENT; dput(dentry); goto out; } path->dentry = dentry; - path->mnt = mntget(nd->path.mnt); - if (should_follow_link(dentry->d_inode, nd->flags & LOOKUP_FOLLOW)) + path->mnt = nd->path.mnt; + if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW)) return 1; + mntget(path->mnt); follow_mount(path); error = 0; out: @@ -2294,10 +2271,11 @@ out: * path_mountpoint - look up a path to be umounted * @dfd: directory file descriptor to start walk from * @name: full pathname to walk + * @path: pointer to container for result * @flags: lookup flags * * Look up the given name, but don't attempt to revalidate the last component. - * Returns 0 and "path" will be valid on success; Retuns error otherwise. + * Returns 0 and "path" will be valid on success; Returns error otherwise. */ static int path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) @@ -2403,7 +2381,7 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; - return !inode_capable(inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } /* @@ -2425,12 +2403,14 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) * 10. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct inode *dir,struct dentry *victim,int isdir) +static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) { + struct inode *inode = victim->d_inode; int error; - if (!victim->d_inode) + if (d_is_negative(victim)) return -ENOENT; + BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); @@ -2440,15 +2420,16 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| - IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + + if (check_sticky(dir, inode) || IS_APPEND(inode) || + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) return -EPERM; if (isdir) { - if (!S_ISDIR(victim->d_inode->i_mode)) + if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; - } else if (S_ISDIR(victim->d_inode->i_mode)) + } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; @@ -2467,6 +2448,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) */ static inline int may_create(struct inode *dir, struct dentry *child) { + audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) @@ -2506,6 +2488,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); return NULL; } +EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { @@ -2515,6 +2498,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } +EXPORT_SYMBOL(unlock_rename); int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) @@ -2535,6 +2519,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_create); static int may_open(struct path *path, int acc_mode, int flag) { @@ -2598,7 +2583,7 @@ static int handle_truncate(struct file *filp) /* * Refuse to truncate files with mandatory locks held on them. */ - error = locks_verify_locked(inode); + error = locks_verify_locked(filp); if (!error) error = security_path_truncate(path); if (!error) { @@ -2982,7 +2967,7 @@ retry_lookup: /* * create/update audit record if it already exists. */ - if (path->dentry->d_inode) + if (d_is_positive(path->dentry)) audit_inode(name, path->dentry, 0); /* @@ -3011,12 +2996,12 @@ retry_lookup: finish_lookup: /* we _can_ be in RCU mode here */ error = -ENOENT; - if (!inode) { + if (!inode || d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } - if (should_follow_link(inode, !symlink_ok)) { + if (should_follow_link(path->dentry, !symlink_ok)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { error = -ECHILD; @@ -3045,10 +3030,10 @@ finish_open: } audit_inode(name, nd->path.dentry, 0); error = -EISDIR; - if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) + if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) goto out; error = -ENOTDIR; - if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) + if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; if (!S_ISREG(nd->inode->i_mode)) will_truncate = false; @@ -3274,7 +3259,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, nd.root.mnt = mnt; nd.root.dentry = dentry; - if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); @@ -3324,8 +3309,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname, goto unlock; error = -EEXIST; - if (dentry->d_inode) + if (d_is_positive(dentry)) goto fail; + /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - @@ -3403,6 +3389,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { @@ -3492,6 +3479,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fsnotify_mkdir(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mkdir); SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { @@ -3546,6 +3534,7 @@ void dentry_unhash(struct dentry *dentry) __d_drop(dentry); spin_unlock(&dentry->d_lock); } +EXPORT_SYMBOL(dentry_unhash); int vfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -3583,6 +3572,7 @@ out: d_delete(dentry); return error; } +EXPORT_SYMBOL(vfs_rmdir); static long do_rmdir(int dfd, const char __user *pathname) { @@ -3646,8 +3636,27 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) return do_rmdir(AT_FDCWD, pathname); } -int vfs_unlink(struct inode *dir, struct dentry *dentry) +/** + * vfs_unlink - unlink a filesystem object + * @dir: parent directory + * @dentry: victim + * @delegated_inode: returns victim inode, if the inode is delegated. + * + * The caller must hold dir->i_mutex. + * + * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and + * return a reference to the inode in delegated_inode. The caller + * should then break the delegation on that inode and retry. Because + * breaking a delegation may take a long time, the caller should drop + * dir->i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { + struct inode *target = dentry->d_inode; int error = may_delete(dir, dentry, 0); if (error) @@ -3656,27 +3665,32 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&target->i_mutex); if (d_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; error = dir->i_op->unlink(dir, dentry); if (!error) dont_mount(dentry); } } - mutex_unlock(&dentry->d_inode->i_mutex); +out: + mutex_unlock(&target->i_mutex); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { - fsnotify_link_count(dentry->d_inode); + fsnotify_link_count(target); d_delete(dentry); } return error; } +EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its @@ -3691,6 +3705,7 @@ static long do_unlinkat(int dfd, const char __user *pathname) struct dentry *dentry; struct nameidata nd; struct inode *inode = NULL; + struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: name = user_path_parent(dfd, pathname, &nd, lookup_flags); @@ -3705,7 +3720,7 @@ retry: error = mnt_want_write(nd.path.mnt); if (error) goto exit1; - +retry_deleg: mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); @@ -3714,19 +3729,25 @@ retry: if (nd.last.name[nd.last.len]) goto slashes; inode = dentry->d_inode; - if (!inode) + if (d_is_negative(dentry)) goto slashes; ihold(inode); error = security_path_unlink(&nd.path, dentry); if (error) goto exit2; - error = vfs_unlink(nd.path.dentry->d_inode, dentry); + error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode); exit2: dput(dentry); } mutex_unlock(&nd.path.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ + inode = NULL; + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } mnt_drop_write(nd.path.mnt); exit1: path_put(&nd.path); @@ -3739,8 +3760,12 @@ exit1: return error; slashes: - error = !dentry->d_inode ? -ENOENT : - S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR; + if (d_is_negative(dentry)) + error = -ENOENT; + else if (d_is_dir(dentry)) + error = -EISDIR; + else + error = -ENOTDIR; goto exit2; } @@ -3779,6 +3804,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_symlink); SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) @@ -3816,7 +3842,26 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn return sys_symlinkat(oldname, AT_FDCWD, newname); } -int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) +/** + * vfs_link - create a new link + * @old_dentry: object to be linked + * @dir: new parent + * @new_dentry: where to create the new link + * @delegated_inode: returns inode needing a delegation break + * + * The caller must hold dir->i_mutex + * + * If vfs_link discovers a delegation on the to-be-linked file in need + * of breaking, it will return -EWOULDBLOCK and return a reference to the + * inode in delegated_inode. The caller should then break the delegation + * and retry. Because breaking a delegation may take a long time, the + * caller should drop the i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -3852,8 +3897,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; - else - error = dir->i_op->link(old_dentry, dir, new_dentry); + else { + error = try_break_deleg(inode, delegated_inode); + if (!error) + error = dir->i_op->link(old_dentry, dir, new_dentry); + } if (!error && (inode->i_state & I_LINKABLE)) { spin_lock(&inode->i_lock); @@ -3865,6 +3913,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de fsnotify_link(dir, inode, new_dentry); return error; } +EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid @@ -3880,6 +3929,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, { struct dentry *new_dentry; struct path old_path, new_path; + struct inode *delegated_inode = NULL; int how = 0; int error; @@ -3918,10 +3968,18 @@ retry: error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); + error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) { + path_put(&old_path); + goto retry; + } + } if (retry_estale(error, how)) { + path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } @@ -3936,7 +3994,28 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -/* +/** + * vfs_rename - rename a filesystem object + * @old_dir: parent of source + * @old_dentry: source + * @new_dir: parent of destination + * @new_dentry: destination + * @delegated_inode: returns an inode needing a delegation break + * @flags: rename flags + * + * The caller must hold multiple mutexes--see lock_rename()). + * + * If vfs_rename discovers a delegation in need of breaking at either + * the source or destination, it will return -EWOULDBLOCK and return a + * reference to the inode in delegated_inode. The caller should then + * break the delegation and retry. Because breaking a delegation may + * take a long time, the caller should drop all locks before doing + * so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: @@ -3945,7 +4024,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * That's where 4.4 screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. - * c) we have to lock _three_ objects - parents and victim (if it exists). + * c) we have to lock _four_ objects - parents and victim (if it exists), + * and source (if it is not a directory). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change @@ -3963,143 +4043,158 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ -static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + struct inode **delegated_inode, unsigned int flags) { - int error = 0; + int error; + bool is_dir = d_is_dir(old_dentry); + const unsigned char *old_name; + struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; + bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; - /* - * If we are going to change the parent - check write permissions, - * we'll need to flip '..'. - */ - if (new_dir != old_dir) { - error = inode_permission(old_dentry->d_inode, MAY_WRITE); - if (error) - return error; - } + if (source == target) + return 0; - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + error = may_delete(old_dir, old_dentry, is_dir); if (error) return error; - dget(new_dentry); - if (target) - mutex_lock(&target->i_mutex); + if (!target) { + error = may_create(new_dir, new_dentry); + } else { + new_is_dir = d_is_dir(new_dentry); - error = -EBUSY; - if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) - goto out; + if (!(flags & RENAME_EXCHANGE)) + error = may_delete(new_dir, new_dentry, is_dir); + else + error = may_delete(new_dir, new_dentry, new_is_dir); + } + if (error) + return error; - error = -EMLINK; - if (max_links && !target && new_dir != old_dir && - new_dir->i_nlink >= max_links) - goto out; + if (!old_dir->i_op->rename) + return -EPERM; - if (target) - shrink_dcache_parent(new_dentry); - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - goto out; + if (flags && !old_dir->i_op->rename2) + return -EINVAL; - if (target) { - target->i_flags |= S_DEAD; - dont_mount(new_dentry); + /* + * If we are going to change the parent - check write permissions, + * we'll need to flip '..'. + */ + if (new_dir != old_dir) { + if (is_dir) { + error = inode_permission(source, MAY_WRITE); + if (error) + return error; + } + if ((flags & RENAME_EXCHANGE) && new_is_dir) { + error = inode_permission(target, MAY_WRITE); + if (error) + return error; + } } -out: - if (target) - mutex_unlock(&target->i_mutex); - dput(new_dentry); - if (!error) - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry,new_dentry); - return error; -} -static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct inode *target = new_dentry->d_inode; - int error; - - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); if (error) return error; + old_name = fsnotify_oldname_init(old_dentry->d_name.name); dget(new_dentry); - if (target) + if (!is_dir || (flags & RENAME_EXCHANGE)) + lock_two_nondirectories(source, target); + else if (target) mutex_lock(&target->i_mutex); error = -EBUSY; - if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) + if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) goto out; - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (max_links && new_dir != old_dir) { + error = -EMLINK; + if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) + goto out; + if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && + old_dir->i_nlink >= max_links) + goto out; + } + if (is_dir && !(flags & RENAME_EXCHANGE) && target) + shrink_dcache_parent(new_dentry); + if (!is_dir) { + error = try_break_deleg(source, delegated_inode); + if (error) + goto out; + } + if (target && !new_is_dir) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; + } + if (!flags) { + error = old_dir->i_op->rename(old_dir, old_dentry, + new_dir, new_dentry); + } else { + error = old_dir->i_op->rename2(old_dir, old_dentry, + new_dir, new_dentry, flags); + } if (error) goto out; - if (target) + if (!(flags & RENAME_EXCHANGE) && target) { + if (is_dir) + target->i_flags |= S_DEAD; dont_mount(new_dentry); - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry, new_dentry); + } + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { + if (!(flags & RENAME_EXCHANGE)) + d_move(old_dentry, new_dentry); + else + d_exchange(old_dentry, new_dentry); + } out: - if (target) + if (!is_dir || (flags & RENAME_EXCHANGE)) + unlock_two_nondirectories(source, target); + else if (target) mutex_unlock(&target->i_mutex); dput(new_dentry); - return error; -} - -int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - int error; - int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); - const unsigned char *old_name; - - if (old_dentry->d_inode == new_dentry->d_inode) - return 0; - - error = may_delete(old_dir, old_dentry, is_dir); - if (error) - return error; - - if (!new_dentry->d_inode) - error = may_create(new_dir, new_dentry); - else - error = may_delete(new_dir, new_dentry, is_dir); - if (error) - return error; - - if (!old_dir->i_op->rename) - return -EPERM; - - old_name = fsnotify_oldname_init(old_dentry->d_name.name); - - if (is_dir) - error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); - else - error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); - if (!error) + if (!error) { fsnotify_move(old_dir, new_dir, old_name, is_dir, - new_dentry->d_inode, old_dentry); + !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); + if (flags & RENAME_EXCHANGE) { + fsnotify_move(new_dir, old_dir, old_dentry->d_name.name, + new_is_dir, NULL, new_dentry); + } + } fsnotify_oldname_free(old_name); return error; } +EXPORT_SYMBOL(vfs_rename); -SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, - int, newdfd, const char __user *, newname) +SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, unsigned int, flags) { struct dentry *old_dir, *new_dir; struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct nameidata oldnd, newnd; + struct inode *delegated_inode = NULL; struct filename *from; struct filename *to; unsigned int lookup_flags = 0; bool should_retry = false; int error; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) + return -EINVAL; + retry: from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); if (IS_ERR(from)) { @@ -4123,6 +4218,8 @@ retry: goto exit2; new_dir = newnd.path.dentry; + if (flags & RENAME_NOREPLACE) + error = -EEXIST; if (newnd.last_type != LAST_NORM) goto exit2; @@ -4132,8 +4229,10 @@ retry: oldnd.flags &= ~LOOKUP_PARENT; newnd.flags &= ~LOOKUP_PARENT; - newnd.flags |= LOOKUP_RENAME_TARGET; + if (!(flags & RENAME_EXCHANGE)) + newnd.flags |= LOOKUP_RENAME_TARGET; +retry_deleg: trap = lock_rename(new_dir, old_dir); old_dentry = lookup_hash(&oldnd); @@ -4142,41 +4241,62 @@ retry: goto exit3; /* source must exist */ error = -ENOENT; - if (!old_dentry->d_inode) + if (d_is_negative(old_dentry)) + goto exit4; + new_dentry = lookup_hash(&newnd); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) goto exit4; + error = -EEXIST; + if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) + goto exit5; + if (flags & RENAME_EXCHANGE) { + error = -ENOENT; + if (d_is_negative(new_dentry)) + goto exit5; + + if (!d_is_dir(new_dentry)) { + error = -ENOTDIR; + if (newnd.last.name[newnd.last.len]) + goto exit5; + } + } /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!S_ISDIR(old_dentry->d_inode->i_mode)) { + if (!d_is_dir(old_dentry)) { error = -ENOTDIR; if (oldnd.last.name[oldnd.last.len]) - goto exit4; - if (newnd.last.name[newnd.last.len]) - goto exit4; + goto exit5; + if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len]) + goto exit5; } /* source should not be ancestor of target */ error = -EINVAL; if (old_dentry == trap) - goto exit4; - new_dentry = lookup_hash(&newnd); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; + goto exit5; /* target should not be an ancestor of source */ - error = -ENOTEMPTY; + if (!(flags & RENAME_EXCHANGE)) + error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; error = security_path_rename(&oldnd.path, old_dentry, - &newnd.path, new_dentry); + &newnd.path, new_dentry, flags); if (error) goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); + new_dir->d_inode, new_dentry, + &delegated_inode, flags); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_dir, old_dir); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } mnt_drop_write(oldnd.path.mnt); exit2: if (retry_estale(error, lookup_flags)) @@ -4195,16 +4315,20 @@ exit: return error; } -SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) +SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname) { - return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); + return sys_renameat2(olddfd, oldname, newdfd, newname, 0); } -int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) +SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { - int len; + return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); +} - len = PTR_ERR(link); +int readlink_copy(char __user *buffer, int buflen, const char *link) +{ + int len = PTR_ERR(link); if (IS_ERR(link)) goto out; @@ -4216,6 +4340,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c out: return len; } +EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that @@ -4233,11 +4358,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(cookie)) return PTR_ERR(cookie); - res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + res = readlink_copy(buffer, buflen, nd_get_link(&nd)); if (dentry->d_inode->i_op->put_link) dentry->d_inode->i_op->put_link(dentry, &nd, cookie); return res; } +EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) @@ -4257,14 +4383,14 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage) int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct page *page = NULL; - char *s = page_getlink(dentry, &page); - int res = vfs_readlink(dentry,buffer,buflen,s); + int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); if (page) { kunmap(page); page_cache_release(page); } return res; } +EXPORT_SYMBOL(page_readlink); void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) { @@ -4272,6 +4398,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) nd_set_link(nd, page_getlink(dentry, &page)); return page; } +EXPORT_SYMBOL(page_follow_link_light); void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { @@ -4282,6 +4409,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) page_cache_release(page); } } +EXPORT_SYMBOL(page_put_link); /* * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS @@ -4319,45 +4447,18 @@ retry: fail: return err; } +EXPORT_SYMBOL(__page_symlink); int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); } +EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, }; - -EXPORT_SYMBOL(user_path_at); -EXPORT_SYMBOL(follow_down_one); -EXPORT_SYMBOL(follow_down); -EXPORT_SYMBOL(follow_up); -EXPORT_SYMBOL(get_write_access); /* nfsd */ -EXPORT_SYMBOL(lock_rename); -EXPORT_SYMBOL(lookup_one_len); -EXPORT_SYMBOL(page_follow_link_light); -EXPORT_SYMBOL(page_put_link); -EXPORT_SYMBOL(page_readlink); -EXPORT_SYMBOL(__page_symlink); -EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(kern_path); -EXPORT_SYMBOL(vfs_path_lookup); -EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(unlock_rename); -EXPORT_SYMBOL(vfs_create); -EXPORT_SYMBOL(vfs_link); -EXPORT_SYMBOL(vfs_mkdir); -EXPORT_SYMBOL(vfs_mknod); -EXPORT_SYMBOL(generic_permission); -EXPORT_SYMBOL(vfs_readlink); -EXPORT_SYMBOL(vfs_rename); -EXPORT_SYMBOL(vfs_rmdir); -EXPORT_SYMBOL(vfs_symlink); -EXPORT_SYMBOL(vfs_unlink); -EXPORT_SYMBOL(dentry_unhash); -EXPORT_SYMBOL(generic_readlink); diff --git a/fs/namespace.c b/fs/namespace.c index 5918fc31a63..182bc41cd88 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -23,23 +23,46 @@ #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> +#include <linux/bootmem.h> #include "pnode.h" #include "internal.h" -#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) -#define HASH_SIZE (1UL << HASH_SHIFT) +static unsigned int m_hash_mask __read_mostly; +static unsigned int m_hash_shift __read_mostly; +static unsigned int mp_hash_mask __read_mostly; +static unsigned int mp_hash_shift __read_mostly; -static int event; +static __initdata unsigned long mhash_entries; +static int __init set_mhash_entries(char *str) +{ + if (!str) + return 0; + mhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mhash_entries=", set_mhash_entries); + +static __initdata unsigned long mphash_entries; +static int __init set_mphash_entries(char *str) +{ + if (!str) + return 0; + mphash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mphash_entries=", set_mphash_entries); + +static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); static DEFINE_SPINLOCK(mnt_id_lock); static int mnt_id_start = 0; static int mnt_group_start = 1; -static struct list_head *mount_hashtable __read_mostly; -static struct list_head *mountpoint_hashtable __read_mostly; +static struct hlist_head *mount_hashtable __read_mostly; +static struct hlist_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; -static struct rw_semaphore namespace_sem; +static DECLARE_RWSEM(namespace_sem); /* /sys/fs */ struct kobject *fs_kobj; @@ -53,17 +76,22 @@ EXPORT_SYMBOL_GPL(fs_kobj); * It should be taken for write in all cases where the vfsmount * tree or hash is modified or when a vfsmount structure is modified. */ -DEFINE_BRLOCK(vfsmount_lock); +__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) +static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); tmp += ((unsigned long)dentry / L1_CACHE_BYTES); - tmp = tmp + (tmp >> HASH_SHIFT); - return tmp & (HASH_SIZE - 1); + tmp = tmp + (tmp >> m_hash_shift); + return &mount_hashtable[tmp & m_hash_mask]; } -#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) +static inline struct hlist_head *mp_hash(struct dentry *dentry) +{ + unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); + tmp = tmp + (tmp >> mp_hash_shift); + return &mountpoint_hashtable[tmp & mp_hash_mask]; +} /* * allocation is serialized by namespace_sem, but we need the spinlock to @@ -189,7 +217,7 @@ static struct mount *alloc_vfsmnt(const char *name) mnt->mnt_writers = 0; #endif - INIT_LIST_HEAD(&mnt->mnt_hash); + INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); @@ -386,9 +414,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); */ int __mnt_want_write_file(struct file *file) { - struct inode *inode = file_inode(file); - - if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) + if (!(file->f_mode & FMODE_WRITER)) return __mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); @@ -458,7 +484,7 @@ static int mnt_make_readonly(struct mount *mnt) { int ret = 0; - br_write_lock(&vfsmount_lock); + lock_mount_hash(); mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store @@ -492,15 +518,15 @@ static int mnt_make_readonly(struct mount *mnt) */ smp_wmb(); mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); return ret; } static void __mnt_unmake_readonly(struct mount *mnt) { - br_write_lock(&vfsmount_lock); + lock_mount_hash(); mnt->mnt.mnt_flags &= ~MNT_READONLY; - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); } int sb_prepare_remount_readonly(struct super_block *sb) @@ -512,7 +538,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; - br_write_lock(&vfsmount_lock); + lock_mount_hash(); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; @@ -534,7 +560,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); return err; } @@ -542,37 +568,71 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { kfree(mnt->mnt_devname); - mnt_free_id(mnt); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } +static void delayed_free_vfsmnt(struct rcu_head *head) +{ + free_vfsmnt(container_of(head, struct mount, mnt_rcu)); +} + +/* call under rcu_read_lock */ +bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +{ + struct mount *mnt; + if (read_seqretry(&mount_lock, seq)) + return false; + if (bastard == NULL) + return true; + mnt = real_mount(bastard); + mnt_add_count(mnt, 1); + if (likely(!read_seqretry(&mount_lock, seq))) + return true; + if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { + mnt_add_count(mnt, -1); + return false; + } + rcu_read_unlock(); + mntput(bastard); + rcu_read_lock(); + return false; +} + /* - * find the first or last mount at @dentry on vfsmount @mnt depending on - * @dir. If @dir is set return the first mount else return the last mount. - * vfsmount_lock must be held for read or write. + * find the first mount at @dentry on vfsmount @mnt. + * call under rcu_read_lock() */ -struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, - int dir) +struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { - struct list_head *head = mount_hashtable + hash(mnt, dentry); - struct list_head *tmp = head; - struct mount *p, *found = NULL; + struct hlist_head *head = m_hash(mnt, dentry); + struct mount *p; - for (;;) { - tmp = dir ? tmp->next : tmp->prev; - p = NULL; - if (tmp == head) - break; - p = list_entry(tmp, struct mount, mnt_hash); - if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) { - found = p; + hlist_for_each_entry_rcu(p, head, mnt_hash) + if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) + return p; + return NULL; +} + +/* + * find the last mount at @dentry on vfsmount @mnt. + * mount_lock must be held. + */ +struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) +{ + struct mount *p, *res; + res = p = __lookup_mnt(mnt, dentry); + if (!p) + goto out; + hlist_for_each_entry_continue(p, mnt_hash) { + if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) break; - } + res = p; } - return found; +out: + return res; } /* @@ -594,26 +654,26 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, struct vfsmount *lookup_mnt(struct path *path) { struct mount *child_mnt; + struct vfsmount *m; + unsigned seq; - br_read_lock(&vfsmount_lock); - child_mnt = __lookup_mnt(path->mnt, path->dentry, 1); - if (child_mnt) { - mnt_add_count(child_mnt, 1); - br_read_unlock(&vfsmount_lock); - return &child_mnt->mnt; - } else { - br_read_unlock(&vfsmount_lock); - return NULL; - } + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + child_mnt = __lookup_mnt(path->mnt, path->dentry); + m = child_mnt ? &child_mnt->mnt : NULL; + } while (!legitimize_mnt(m, seq)); + rcu_read_unlock(); + return m; } static struct mountpoint *new_mountpoint(struct dentry *dentry) { - struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry); + struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; int ret; - list_for_each_entry(mp, chain, m_hash) { + hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { /* might be worth a WARN_ON() */ if (d_unlinked(dentry)) @@ -635,7 +695,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry) mp->m_dentry = dentry; mp->m_count = 1; - list_add(&mp->m_hash, chain); + hlist_add_head(&mp->m_hash, chain); return mp; } @@ -646,7 +706,7 @@ static void put_mountpoint(struct mountpoint *mp) spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); - list_del(&mp->m_hash); + hlist_del(&mp->m_hash); kfree(mp); } } @@ -688,7 +748,7 @@ static void detach_mnt(struct mount *mnt, struct path *old_path) mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); - list_del_init(&mnt->mnt_hash); + hlist_del_init_rcu(&mnt->mnt_hash); put_mountpoint(mnt->mnt_mp); mnt->mnt_mp = NULL; } @@ -715,15 +775,14 @@ static void attach_mnt(struct mount *mnt, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); - list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(&parent->mnt, mp->m_dentry)); + hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /* * vfsmount lock must be held for write */ -static void commit_tree(struct mount *mnt) +static void commit_tree(struct mount *mnt, struct mount *shadows) { struct mount *parent = mnt->mnt_parent; struct mount *m; @@ -738,8 +797,11 @@ static void commit_tree(struct mount *mnt) list_splice(&head, n->list.prev); - list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(&parent->mnt, mnt->mnt_mountpoint)); + if (shadows) + hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + else + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); touch_mnt_namespace(n); } @@ -788,6 +850,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void root = mount_fs(type, flags, name, data); if (IS_ERR(root)) { + mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_CAST(root); } @@ -796,9 +859,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void mnt->mnt.mnt_sb = root->d_sb; mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; - br_write_lock(&vfsmount_lock); + lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); return &mnt->mnt; } EXPORT_SYMBOL_GPL(vfs_kern_mount); @@ -825,7 +888,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } - mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); /* Don't allow unprivileged users to change mount flags */ if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; @@ -839,9 +902,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; - br_write_lock(&vfsmount_lock); + lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &sb->s_mounts); - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); if ((flag & CL_SLAVE) || ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { @@ -868,68 +931,61 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return mnt; out_free: + mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); } -static inline void mntfree(struct mount *mnt) -{ - struct vfsmount *m = &mnt->mnt; - struct super_block *sb = m->mnt_sb; - - /* - * This probably indicates that somebody messed - * up a mnt_want/drop_write() pair. If this - * happens, the filesystem was probably unable - * to make r/w->r/o transitions. - */ - /* - * The locking used to deal with mnt_count decrement provides barriers, - * so mnt_get_writers() below is safe. - */ - WARN_ON(mnt_get_writers(mnt)); - fsnotify_vfsmount_delete(m); - dput(m->mnt_root); - free_vfsmnt(mnt); - deactivate_super(sb); -} - static void mntput_no_expire(struct mount *mnt) { put_again: -#ifdef CONFIG_SMP - br_read_lock(&vfsmount_lock); - if (likely(mnt->mnt_ns)) { - /* shouldn't be the last one */ - mnt_add_count(mnt, -1); - br_read_unlock(&vfsmount_lock); + rcu_read_lock(); + mnt_add_count(mnt, -1); + if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ + rcu_read_unlock(); return; } - br_read_unlock(&vfsmount_lock); - - br_write_lock(&vfsmount_lock); - mnt_add_count(mnt, -1); + lock_mount_hash(); if (mnt_get_count(mnt)) { - br_write_unlock(&vfsmount_lock); + rcu_read_unlock(); + unlock_mount_hash(); return; } -#else - mnt_add_count(mnt, -1); - if (likely(mnt_get_count(mnt))) - return; - br_write_lock(&vfsmount_lock); -#endif if (unlikely(mnt->mnt_pinned)) { mnt_add_count(mnt, mnt->mnt_pinned + 1); mnt->mnt_pinned = 0; - br_write_unlock(&vfsmount_lock); + rcu_read_unlock(); + unlock_mount_hash(); acct_auto_close_mnt(&mnt->mnt); goto put_again; } + if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { + rcu_read_unlock(); + unlock_mount_hash(); + return; + } + mnt->mnt.mnt_flags |= MNT_DOOMED; + rcu_read_unlock(); list_del(&mnt->mnt_instance); - br_write_unlock(&vfsmount_lock); - mntfree(mnt); + unlock_mount_hash(); + + /* + * This probably indicates that somebody messed + * up a mnt_want/drop_write() pair. If this + * happens, the filesystem was probably unable + * to make r/w->r/o transitions. + */ + /* + * The locking used to deal with mnt_count decrement provides barriers, + * so mnt_get_writers() below is safe. + */ + WARN_ON(mnt_get_writers(mnt)); + fsnotify_vfsmount_delete(&mnt->mnt); + dput(mnt->mnt.mnt_root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } void mntput(struct vfsmount *mnt) @@ -954,21 +1010,21 @@ EXPORT_SYMBOL(mntget); void mnt_pin(struct vfsmount *mnt) { - br_write_lock(&vfsmount_lock); + lock_mount_hash(); real_mount(mnt)->mnt_pinned++; - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); } EXPORT_SYMBOL(mnt_pin); void mnt_unpin(struct vfsmount *m) { struct mount *mnt = real_mount(m); - br_write_lock(&vfsmount_lock); + lock_mount_hash(); if (mnt->mnt_pinned) { mnt_add_count(mnt, 1); mnt->mnt_pinned--; } - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); } EXPORT_SYMBOL(mnt_unpin); @@ -1038,14 +1094,29 @@ static void *m_start(struct seq_file *m, loff_t *pos) struct proc_mounts *p = proc_mounts(m); down_read(&namespace_sem); - return seq_list_start(&p->ns->list, *pos); + if (p->cached_event == p->ns->event) { + void *v = p->cached_mount; + if (*pos == p->cached_index) + return v; + if (*pos == p->cached_index + 1) { + v = seq_list_next(v, &p->ns->list, &p->cached_index); + return p->cached_mount = v; + } + } + + p->cached_event = p->ns->event; + p->cached_mount = seq_list_start(&p->ns->list, *pos); + p->cached_index = *pos; + return p->cached_mount; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = proc_mounts(m); - return seq_list_next(v, &p->ns->list, pos); + p->cached_mount = seq_list_next(v, &p->ns->list, pos); + p->cached_index = *pos; + return p->cached_mount; } static void m_stop(struct seq_file *m, void *v) @@ -1085,12 +1156,12 @@ int may_umount_tree(struct vfsmount *m) BUG_ON(!m); /* write lock needed for mnt_get_count */ - br_write_lock(&vfsmount_lock); + lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { actual_refs += mnt_get_count(p); minimum_refs += 2; } - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); if (actual_refs > minimum_refs) return 0; @@ -1117,48 +1188,40 @@ int may_umount(struct vfsmount *mnt) { int ret = 1; down_read(&namespace_sem); - br_write_lock(&vfsmount_lock); + lock_mount_hash(); if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); up_read(&namespace_sem); return ret; } EXPORT_SYMBOL(may_umount); -static LIST_HEAD(unmounted); /* protected by namespace_sem */ +static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static void namespace_unlock(void) { struct mount *mnt; - LIST_HEAD(head); + struct hlist_head head = unmounted; - if (likely(list_empty(&unmounted))) { + if (likely(hlist_empty(&head))) { up_write(&namespace_sem); return; } - list_splice_init(&unmounted, &head); + head.first->pprev = &head.first; + INIT_HLIST_HEAD(&unmounted); + up_write(&namespace_sem); - while (!list_empty(&head)) { - mnt = list_first_entry(&head, struct mount, mnt_hash); - list_del_init(&mnt->mnt_hash); - if (mnt_has_parent(mnt)) { - struct dentry *dentry; - struct mount *m; - - br_write_lock(&vfsmount_lock); - dentry = mnt->mnt_mountpoint; - m = mnt->mnt_parent; - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; - m->mnt_ghosts--; - br_write_unlock(&vfsmount_lock); - dput(dentry); - mntput(&m->mnt); - } + synchronize_rcu(); + + while (!hlist_empty(&head)) { + mnt = hlist_entry(head.first, struct mount, mnt_hash); + hlist_del_init(&mnt->mnt_hash); + if (mnt->mnt_ex_mountpoint.mnt) + path_put(&mnt->mnt_ex_mountpoint); mntput(&mnt->mnt); } } @@ -1169,34 +1232,51 @@ static inline void namespace_lock(void) } /* - * vfsmount lock must be held for write + * mount_lock must be held * namespace_sem must be held for write + * how = 0 => just this tree, don't propagate + * how = 1 => propagate; we know that nobody else has reference to any victims + * how = 2 => lazy umount */ -void umount_tree(struct mount *mnt, int propagate) +void umount_tree(struct mount *mnt, int how) { - LIST_HEAD(tmp_list); + HLIST_HEAD(tmp_list); struct mount *p; + struct mount *last = NULL; - for (p = mnt; p; p = next_mnt(p, mnt)) - list_move(&p->mnt_hash, &tmp_list); + for (p = mnt; p; p = next_mnt(p, mnt)) { + hlist_del_init_rcu(&p->mnt_hash); + hlist_add_head(&p->mnt_hash, &tmp_list); + } - if (propagate) + if (how) propagate_umount(&tmp_list); - list_for_each_entry(p, &tmp_list, mnt_hash) { + hlist_for_each_entry(p, &tmp_list, mnt_hash) { list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; + if (how < 2) + p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { - p->mnt_parent->mnt_ghosts++; put_mountpoint(p->mnt_mp); + /* move the reference to mountpoint into ->mnt_ex_mountpoint */ + p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; + p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; + p->mnt_mountpoint = p->mnt.mnt_root; + p->mnt_parent = p; p->mnt_mp = NULL; } change_mnt_propagation(p, MS_PRIVATE); + last = p; + } + if (last) { + last->mnt_hash.next = unmounted.first; + unmounted.first = tmp_list.first; + unmounted.first->pprev = &unmounted.first; } - list_splice(&tmp_list, &unmounted); } static void shrink_submounts(struct mount *mnt); @@ -1225,12 +1305,12 @@ static int do_umount(struct mount *mnt, int flags) * probably don't strictly need the lock here if we examined * all race cases, but it's a slowpath. */ - br_write_lock(&vfsmount_lock); + lock_mount_hash(); if (mnt_get_count(mnt) != 2) { - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); return -EBUSY; } - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; @@ -1272,19 +1352,23 @@ static int do_umount(struct mount *mnt, int flags) } namespace_lock(); - br_write_lock(&vfsmount_lock); + lock_mount_hash(); event++; - if (!(flags & MNT_DETACH)) - shrink_submounts(mnt); - - retval = -EBUSY; - if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { + if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 1); + umount_tree(mnt, 2); retval = 0; + } else { + shrink_submounts(mnt); + retval = -EBUSY; + if (!propagate_mount_busy(mnt, 2)) { + if (!list_empty(&mnt->mnt_list)) + umount_tree(mnt, 1); + retval = 0; + } } - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); namespace_unlock(); return retval; } @@ -1427,18 +1511,18 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, q = clone_mnt(p, p->mnt.mnt_root, flag); if (IS_ERR(q)) goto out; - br_write_lock(&vfsmount_lock); + lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); attach_mnt(q, parent, p->mnt_mp); - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); } } return res; out: if (res) { - br_write_lock(&vfsmount_lock); + lock_mount_hash(); umount_tree(res, 0); - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); } return q; } @@ -1460,9 +1544,9 @@ struct vfsmount *collect_mounts(struct path *path) void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); - br_write_lock(&vfsmount_lock); + lock_mount_hash(); umount_tree(real_mount(mnt), 0); - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); namespace_unlock(); } @@ -1576,24 +1660,23 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct mountpoint *dest_mp, struct path *parent_path) { - LIST_HEAD(tree_list); + HLIST_HEAD(tree_list); struct mount *child, *p; + struct hlist_node *n; int err; if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; - } - err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); - if (err) - goto out_cleanup_ids; - - br_write_lock(&vfsmount_lock); - - if (IS_MNT_SHARED(dest_mnt)) { + err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); + lock_mount_hash(); + if (err) + goto out_cleanup_ids; for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); + } else { + lock_mount_hash(); } if (parent_path) { detach_mnt(source_mnt, parent_path); @@ -1601,20 +1684,27 @@ static int attach_recursive_mnt(struct mount *source_mnt, touch_mnt_namespace(source_mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); - commit_tree(source_mnt); + commit_tree(source_mnt, NULL); } - list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { - list_del_init(&child->mnt_hash); - commit_tree(child); + hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { + struct mount *q; + hlist_del_init(&child->mnt_hash); + q = __lookup_mnt_last(&child->mnt_parent->mnt, + child->mnt_mountpoint); + commit_tree(child, q); } - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); return 0; out_cleanup_ids: - if (IS_MNT_SHARED(dest_mnt)) - cleanup_group_ids(source_mnt, NULL); + while (!hlist_empty(&tree_list)) { + child = hlist_entry(tree_list.first, struct mount, mnt_hash); + umount_tree(child, 0); + } + unlock_mount_hash(); + cleanup_group_ids(source_mnt, NULL); out: return err; } @@ -1710,10 +1800,10 @@ static int do_change_type(struct path *path, int flag) goto out_unlock; } - br_write_lock(&vfsmount_lock); + lock_mount_hash(); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); out_unlock: namespace_unlock(); @@ -1785,9 +1875,9 @@ static int do_loopback(struct path *path, const char *old_name, err = graft_tree(mnt, parent, mp); if (err) { - br_write_lock(&vfsmount_lock); + lock_mount_hash(); umount_tree(mnt, 0); - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); } out2: unlock_mount(mp); @@ -1846,17 +1936,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags, else err = do_remount_sb(sb, flags, data, 0); if (!err) { - br_write_lock(&vfsmount_lock); + lock_mount_hash(); mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; mnt->mnt.mnt_flags = mnt_flags; - br_write_unlock(&vfsmount_lock); - } - up_write(&sb->s_umount); - if (!err) { - br_write_lock(&vfsmount_lock); touch_mnt_namespace(mnt->mnt_ns); - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); } + up_write(&sb->s_umount); return err; } @@ -1972,7 +2058,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) struct mount *parent; int err; - mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); + mnt_flags &= ~MNT_INTERNAL_FLAGS; mp = lock_mount(path); if (IS_ERR(mp)) @@ -2077,9 +2163,7 @@ fail: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { namespace_lock(); - br_write_lock(&vfsmount_lock); list_del_init(&mnt->mnt_expire); - br_write_unlock(&vfsmount_lock); namespace_unlock(); } mntput(m); @@ -2095,11 +2179,9 @@ fail: void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { namespace_lock(); - br_write_lock(&vfsmount_lock); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); - br_write_unlock(&vfsmount_lock); namespace_unlock(); } EXPORT_SYMBOL(mnt_set_expiry); @@ -2118,7 +2200,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) return; namespace_lock(); - br_write_lock(&vfsmount_lock); + lock_mount_hash(); /* extract from the expiration list every vfsmount that matches the * following criteria: @@ -2137,7 +2219,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, 1); } - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); namespace_unlock(); } @@ -2193,7 +2275,7 @@ resume: * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint * - * vfsmount_lock must be held for write + * mount_lock must be held for write */ static void shrink_submounts(struct mount *mnt) { @@ -2414,20 +2496,25 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) return new_ns; } -/* - * Allocate a new namespace structure and populate it with contents - * copied from the namespace of the passed in task structure. - */ -static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, - struct user_namespace *user_ns, struct fs_struct *fs) +struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, + struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; struct mount *p, *q; - struct mount *old = mnt_ns->root; + struct mount *old; struct mount *new; int copy_flags; + BUG_ON(!ns); + + if (likely(!(flags & CLONE_NEWNS))) { + get_mnt_ns(ns); + return ns; + } + + old = ns->root; + new_ns = alloc_mnt_ns(user_ns); if (IS_ERR(new_ns)) return new_ns; @@ -2435,7 +2522,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, namespace_lock(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; - if (user_ns != mnt_ns->user_ns) + if (user_ns != ns->user_ns) copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { @@ -2444,9 +2531,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, return ERR_CAST(new); } new_ns->root = new; - br_write_lock(&vfsmount_lock); list_add_tail(&new_ns->list, &new->mnt_list); - br_write_unlock(&vfsmount_lock); /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts @@ -2457,13 +2542,13 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, q = new; while (p) { q->mnt_ns = new_ns; - if (fs) { - if (&p->mnt == fs->root.mnt) { - fs->root.mnt = mntget(&q->mnt); + if (new_fs) { + if (&p->mnt == new_fs->root.mnt) { + new_fs->root.mnt = mntget(&q->mnt); rootmnt = &p->mnt; } - if (&p->mnt == fs->pwd.mnt) { - fs->pwd.mnt = mntget(&q->mnt); + if (&p->mnt == new_fs->pwd.mnt) { + new_fs->pwd.mnt = mntget(&q->mnt); pwdmnt = &p->mnt; } } @@ -2484,23 +2569,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, return new_ns; } -struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, - struct user_namespace *user_ns, struct fs_struct *new_fs) -{ - struct mnt_namespace *new_ns; - - BUG_ON(!ns); - get_mnt_ns(ns); - - if (!(flags & CLONE_NEWNS)) - return ns; - - new_ns = dup_mnt_ns(ns, user_ns, new_fs); - - put_mnt_ns(ns); - return new_ns; -} - /** * create_mnt_ns - creates a private namespace and adds a root filesystem * @mnt: pointer to the new root filesystem mountpoint @@ -2593,7 +2661,7 @@ out_type: /* * Return true if path is reachable from root * - * namespace_sem or vfsmount_lock is held + * namespace_sem or mount_lock is held */ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) @@ -2608,9 +2676,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, int path_is_under(struct path *path1, struct path *path2) { int res; - br_read_lock(&vfsmount_lock); + read_seqlock_excl(&mount_lock); res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); return res; } EXPORT_SYMBOL(path_is_under); @@ -2701,7 +2769,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; root_mp->m_count++; /* pin it so it won't go away */ - br_write_lock(&vfsmount_lock); + lock_mount_hash(); detach_mnt(new_mnt, &parent_path); detach_mnt(root_mnt, &root_parent); if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { @@ -2713,7 +2781,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, /* mount new_root on / */ attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); touch_mnt_namespace(current->nsproxy->mnt_ns); - br_write_unlock(&vfsmount_lock); + unlock_mount_hash(); chroot_fs_refs(&root, &new); put_mountpoint(root_mp); error = 0; @@ -2767,25 +2835,29 @@ void __init mnt_init(void) unsigned u; int err; - init_rwsem(&namespace_sem); - mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); - mountpoint_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); + mount_hashtable = alloc_large_system_hash("Mount-cache", + sizeof(struct hlist_head), + mhash_entries, 19, + 0, + &m_hash_shift, &m_hash_mask, 0, 0); + mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", + sizeof(struct hlist_head), + mphash_entries, 19, + 0, + &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE); - - for (u = 0; u < HASH_SIZE; u++) - INIT_LIST_HEAD(&mount_hashtable[u]); - for (u = 0; u < HASH_SIZE; u++) - INIT_LIST_HEAD(&mountpoint_hashtable[u]); + for (u = 0; u <= m_hash_mask; u++) + INIT_HLIST_HEAD(&mount_hashtable[u]); + for (u = 0; u <= mp_hash_mask; u++) + INIT_HLIST_HEAD(&mountpoint_hashtable[u]); - br_lock_init(&vfsmount_lock); + kernfs_init(); err = sysfs_init(); if (err) @@ -2825,9 +2897,8 @@ void kern_unmount(struct vfsmount *mnt) { /* release long term mount so mount point can be released */ if (!IS_ERR_OR_NULL(mnt)) { - br_write_lock(&vfsmount_lock); real_mount(mnt)->mnt_ns = NULL; - br_write_unlock(&vfsmount_lock); + synchronize_rcu(); /* yecchhh... */ mntput(mnt); } } @@ -2871,7 +2942,7 @@ bool fs_fully_visible(struct file_system_type *type) if (unlikely(!ns)) return false; - namespace_lock(); + down_read(&namespace_sem); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; if (mnt->mnt.mnt_sb->s_type != type) @@ -2884,7 +2955,7 @@ bool fs_fully_visible(struct file_system_type *type) struct inode *inode = child->mnt_mountpoint->d_inode; if (!S_ISDIR(inode->i_mode)) goto next; - if (inode->i_nlink != 2) + if (inode->i_nlink > 2) goto next; } visible = true; @@ -2892,7 +2963,7 @@ bool fs_fully_visible(struct file_system_type *type) next: ; } found: - namespace_unlock(); + up_read(&namespace_sem); return visible; } diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index c320ac52353..08b8ea8c353 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -339,7 +339,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) if (val) goto finished; - DDPRINTK("ncp_lookup_validate: %pd2 not valid, age=%ld, server lookup\n", + ncp_dbg(2, "%pd2 not valid, age=%ld, server lookup\n", dentry, NCP_GET_AGE(dentry)); len = sizeof(__name); @@ -358,7 +358,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) res = ncp_obtain_info(server, dir, __name, &(finfo.i)); } finfo.volume = finfo.i.volNumber; - DDPRINTK("ncp_lookup_validate: looked for %pd/%s, res=%d\n", + ncp_dbg(2, "looked for %pd/%s, res=%d\n", dentry->d_parent, __name, res); /* * If we didn't find it, or if it has a different dirEntNum to @@ -372,14 +372,14 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) ncp_new_dentry(dentry); val=1; } else - DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n"); + ncp_dbg(2, "found, but dirEntNum changed\n"); ncp_update_inode2(inode, &finfo); mutex_unlock(&inode->i_mutex); } finished: - DDPRINTK("ncp_lookup_validate: result=%d\n", val); + ncp_dbg(2, "result=%d\n", val); dput(parent); return val; } @@ -453,8 +453,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx) ctl.page = NULL; ctl.cache = NULL; - DDPRINTK("ncp_readdir: reading %pD2, pos=%d\n", file, - (int) ctx->pos); + ncp_dbg(2, "reading %pD2, pos=%d\n", file, (int)ctx->pos); result = -EIO; /* Do not generate '.' and '..' when server is dead. */ @@ -697,8 +696,7 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx, struct ncp_entry_info entry; int i; - DPRINTK("ncp_read_volume_list: pos=%ld\n", - (unsigned long) ctx->pos); + ncp_dbg(1, "pos=%ld\n", (unsigned long)ctx->pos); for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { int inval_dentry; @@ -708,12 +706,11 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx, if (!strlen(info.volume_name)) continue; - DPRINTK("ncp_read_volume_list: found vol: %s\n", - info.volume_name); + ncp_dbg(1, "found vol: %s\n", info.volume_name); if (ncp_lookup_volume(server, info.volume_name, &entry.i)) { - DPRINTK("ncpfs: could not lookup vol %s\n", + ncp_dbg(1, "could not lookup vol %s\n", info.volume_name); continue; } @@ -738,14 +735,13 @@ ncp_do_readdir(struct file *file, struct dir_context *ctx, int more; size_t bufsize; - DPRINTK("ncp_do_readdir: %pD2, fpos=%ld\n", file, - (unsigned long) ctx->pos); - PPRINTK("ncp_do_readdir: init %pD, volnum=%d, dirent=%u\n", - file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum); + ncp_dbg(1, "%pD2, fpos=%ld\n", file, (unsigned long)ctx->pos); + ncp_vdbg("init %pD, volnum=%d, dirent=%u\n", + file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum); err = ncp_initialize_search(server, dir, &seq); if (err) { - DPRINTK("ncp_do_readdir: init failed, err=%d\n", err); + ncp_dbg(1, "init failed, err=%d\n", err); return; } /* We MUST NOT use server->buffer_size handshaked with server if we are @@ -808,8 +804,7 @@ int ncp_conn_logged_in(struct super_block *sb) goto out; result = -ENOENT; if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) { - PPRINTK("ncp_conn_logged_in: %s not found\n", - server->m.mounted_vol); + ncp_vdbg("%s not found\n", server->m.mounted_vol); goto out; } dent = sb->s_root; @@ -822,10 +817,10 @@ int ncp_conn_logged_in(struct super_block *sb) NCP_FINFO(ino)->DosDirNum = DosDirNum; result = 0; } else { - DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n"); + ncp_dbg(1, "sb->s_root->d_inode == NULL!\n"); } } else { - DPRINTK("ncpfs: sb->s_root == NULL!\n"); + ncp_dbg(1, "sb->s_root == NULL!\n"); } } else result = 0; @@ -846,7 +841,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig if (!ncp_conn_valid(server)) goto finished; - PPRINTK("ncp_lookup: server lookup for %pd2\n", dentry); + ncp_vdbg("server lookup for %pd2\n", dentry); len = sizeof(__name); if (ncp_is_server_root(dir)) { @@ -854,15 +849,15 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig dentry->d_name.len, 1); if (!res) res = ncp_lookup_volume(server, __name, &(finfo.i)); - if (!res) - ncp_update_known_namespace(server, finfo.i.volNumber, NULL); + if (!res) + ncp_update_known_namespace(server, finfo.i.volNumber, NULL); } else { res = ncp_io2vol(server, __name, &len, dentry->d_name.name, dentry->d_name.len, !ncp_preserve_case(dir)); if (!res) res = ncp_obtain_info(server, dir, __name, &(finfo.i)); } - PPRINTK("ncp_lookup: looked for %pd2, res=%d\n", dentry, res); + ncp_vdbg("looked for %pd2, res=%d\n", dentry, res); /* * If we didn't find an entry, make a negative dentry. */ @@ -886,7 +881,7 @@ add_entry: } finished: - PPRINTK("ncp_lookup: result=%d\n", error); + ncp_vdbg("result=%d\n", error); return ERR_PTR(error); } @@ -909,7 +904,7 @@ out: return error; out_close: - PPRINTK("ncp_instantiate: %pd2 failed, closing file\n", dentry); + ncp_vdbg("%pd2 failed, closing file\n", dentry); ncp_close_file(NCP_SERVER(dir), finfo->file_handle); goto out; } @@ -923,7 +918,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode, int opmode; __u8 __name[NCP_MAXPATHLEN + 1]; - PPRINTK("ncp_create_new: creating %pd2, mode=%hx\n", dentry, mode); + ncp_vdbg("creating %pd2, mode=%hx\n", dentry, mode); ncp_age_dentry(server, dentry); len = sizeof(__name); @@ -952,7 +947,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode, error = -ENAMETOOLONG; else if (result < 0) error = result; - DPRINTK("ncp_create: %pd2 failed\n", dentry); + ncp_dbg(1, "%pd2 failed\n", dentry); goto out; } opmode = O_WRONLY; @@ -985,7 +980,7 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int error, len; __u8 __name[NCP_MAXPATHLEN + 1]; - DPRINTK("ncp_mkdir: making %pd2\n", dentry); + ncp_dbg(1, "making %pd2\n", dentry); ncp_age_dentry(server, dentry); len = sizeof(__name); @@ -1022,7 +1017,7 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry) int error, result, len; __u8 __name[NCP_MAXPATHLEN + 1]; - DPRINTK("ncp_rmdir: removing %pd2\n", dentry); + ncp_dbg(1, "removing %pd2\n", dentry); len = sizeof(__name); error = ncp_io2vol(server, __name, &len, dentry->d_name.name, @@ -1067,13 +1062,13 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry) int error; server = NCP_SERVER(dir); - DPRINTK("ncp_unlink: unlinking %pd2\n", dentry); + ncp_dbg(1, "unlinking %pd2\n", dentry); /* * Check whether to close the file ... */ if (inode) { - PPRINTK("ncp_unlink: closing file\n"); + ncp_vdbg("closing file\n"); ncp_make_closed(inode); } @@ -1087,7 +1082,7 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry) #endif switch (error) { case 0x00: - DPRINTK("ncp: removed %pd2\n", dentry); + ncp_dbg(1, "removed %pd2\n", dentry); break; case 0x85: case 0x8A: @@ -1120,7 +1115,7 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, int old_len, new_len; __u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1]; - DPRINTK("ncp_rename: %pd2 to %pd2\n", old_dentry, new_dentry); + ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry); ncp_age_dentry(server, old_dentry); ncp_age_dentry(server, new_dentry); @@ -1150,8 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, #endif switch (error) { case 0x00: - DPRINTK("ncp renamed %pd -> %pd.\n", - old_dentry, new_dentry); + ncp_dbg(1, "renamed %pd -> %pd\n", + old_dentry, new_dentry); break; case 0x9E: error = -ENAMETOOLONG; @@ -1173,7 +1168,7 @@ static int ncp_mknod(struct inode * dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) { - DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%ho\n", mode); + ncp_dbg(1, "mode = 0%ho\n", mode); return ncp_create_new(dir, dentry, mode, rdev, 0); } return -EPERM; /* Strange, but true */ diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 8f5074e1ecb..77640a8bfb8 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -6,6 +6,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <asm/uaccess.h> #include <linux/time.h> @@ -34,11 +36,11 @@ int ncp_make_open(struct inode *inode, int right) error = -EINVAL; if (!inode) { - printk(KERN_ERR "ncp_make_open: got NULL inode\n"); + pr_err("%s: got NULL inode\n", __func__); goto out; } - DPRINTK("ncp_make_open: opened=%d, volume # %u, dir entry # %u\n", + ncp_dbg(1, "opened=%d, volume # %u, dir entry # %u\n", atomic_read(&NCP_FINFO(inode)->opened), NCP_FINFO(inode)->volNumber, NCP_FINFO(inode)->dirEntNum); @@ -71,7 +73,7 @@ int ncp_make_open(struct inode *inode, int right) break; } if (result) { - PPRINTK("ncp_make_open: failed, result=%d\n", result); + ncp_vdbg("failed, result=%d\n", result); goto out_unlock; } /* @@ -83,7 +85,7 @@ int ncp_make_open(struct inode *inode, int right) } access = NCP_FINFO(inode)->access; - PPRINTK("ncp_make_open: file open, access=%x\n", access); + ncp_vdbg("file open, access=%x\n", access); if (access == right || access == O_RDWR) { atomic_inc(&NCP_FINFO(inode)->opened); error = 0; @@ -107,7 +109,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) void* freepage; size_t freelen; - DPRINTK("ncp_file_read: enter %pd2\n", dentry); + ncp_dbg(1, "enter %pd2\n", dentry); pos = *ppos; @@ -124,7 +126,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) error = ncp_make_open(inode, O_RDONLY); if (error) { - DPRINTK(KERN_ERR "ncp_file_read: open failed, error=%d\n", error); + ncp_dbg(1, "open failed, error=%d\n", error); return error; } @@ -165,7 +167,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) file_accessed(file); - DPRINTK("ncp_file_read: exit %pd2\n", dentry); + ncp_dbg(1, "exit %pd2\n", dentry); outrel: ncp_inode_close(inode); return already_read ? already_read : error; @@ -182,7 +184,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * int errno; void* bouncebuffer; - DPRINTK("ncp_file_write: enter %pd2\n", dentry); + ncp_dbg(1, "enter %pd2\n", dentry); if ((ssize_t) count < 0) return -EINVAL; pos = *ppos; @@ -211,7 +213,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * return 0; errno = ncp_make_open(inode, O_WRONLY); if (errno) { - DPRINTK(KERN_ERR "ncp_file_write: open failed, error=%d\n", errno); + ncp_dbg(1, "open failed, error=%d\n", errno); return errno; } bufsize = NCP_SERVER(inode)->buffer_size; @@ -261,7 +263,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * i_size_write(inode, pos); mutex_unlock(&inode->i_mutex); } - DPRINTK("ncp_file_write: exit %pd2\n", dentry); + ncp_dbg(1, "exit %pd2\n", dentry); outrel: ncp_inode_close(inode); return already_written ? already_written : errno; @@ -269,7 +271,7 @@ outrel: static int ncp_release(struct inode *inode, struct file *file) { if (ncp_make_closed(inode)) { - DPRINTK("ncp_release: failed to close\n"); + ncp_dbg(1, "failed to close\n"); } return 0; } diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c index 0af3349de85..344889cd120 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c @@ -2,6 +2,8 @@ * getopt.c */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/string.h> @@ -46,29 +48,28 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts if (opts->has_arg & OPT_NOPARAM) { return opts->val; } - printk(KERN_INFO "%s: the %s option requires an argument\n", - caller, token); + pr_info("%s: the %s option requires an argument\n", + caller, token); return -EINVAL; } if (opts->has_arg & OPT_INT) { - char* v; + int rc = kstrtoul(val, 0, value); - *value = simple_strtoul(val, &v, 0); - if (!*v) { - return opts->val; + if (rc) { + pr_info("%s: invalid numeric value in %s=%s\n", + caller, token, val); + return rc; } - printk(KERN_INFO "%s: invalid numeric value in %s=%s\n", - caller, token, val); - return -EDOM; + return opts->val; } if (opts->has_arg & OPT_STRING) { return opts->val; } - printk(KERN_INFO "%s: unexpected argument %s to the %s option\n", + pr_info("%s: unexpected argument %s to the %s option\n", caller, val, token); return -EINVAL; } } - printk(KERN_INFO "%s: Unrecognized mount option %s\n", caller, token); + pr_info("%s: Unrecognized mount option %s\n", caller, token); return -EOPNOTSUPP; } diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 4659da67e7f..e31e589369a 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <asm/uaccess.h> @@ -99,6 +101,7 @@ static void destroy_inodecache(void) static int ncp_remount(struct super_block *sb, int *flags, char* data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return 0; } @@ -132,7 +135,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo) NCP_FINFO(inode)->access = nwinfo->access; memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle, sizeof(nwinfo->file_handle)); - DPRINTK("ncp_update_inode: updated %s, volnum=%d, dirent=%u\n", + ncp_dbg(1, "updated %s, volnum=%d, dirent=%u\n", nwinfo->i.entryName, NCP_FINFO(inode)->volNumber, NCP_FINFO(inode)->dirEntNum); } @@ -140,8 +143,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo) static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi) { /* NFS namespace mode overrides others if it's set. */ - DPRINTK(KERN_DEBUG "ncp_update_dates_and_mode: (%s) nfs.mode=0%o\n", - nwi->entryName, nwi->nfs.mode); + ncp_dbg(1, "(%s) nfs.mode=0%o\n", nwi->entryName, nwi->nfs.mode); if (nwi->nfs.mode) { /* XXX Security? */ inode->i_mode = nwi->nfs.mode; @@ -229,7 +231,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) ncp_update_attrs(inode, nwinfo); - DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode); + ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode); set_nlink(inode, 1); inode->i_uid = server->m.uid; @@ -257,7 +259,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) struct inode *inode; if (info == NULL) { - printk(KERN_ERR "ncp_iget: info is NULL\n"); + pr_err("%s: info is NULL\n", __func__); return NULL; } @@ -289,23 +291,23 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) } insert_inode_hash(inode); } else - printk(KERN_ERR "ncp_iget: iget failed!\n"); + pr_err("%s: iget failed!\n", __func__); return inode; } static void ncp_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (S_ISDIR(inode->i_mode)) { - DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino); + ncp_dbg(2, "put directory %ld\n", inode->i_ino); } if (ncp_make_closed(inode) != 0) { /* We can't do anything but complain. */ - printk(KERN_ERR "ncp_evict_inode: could not close\n"); + pr_err("%s: could not close\n", __func__); } } @@ -468,9 +470,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) { struct ncp_mount_data_kernel data; struct ncp_server *server; - struct file *ncp_filp; struct inode *root_inode; - struct inode *sock_inode; struct socket *sock; int error; int default_bufsize; @@ -539,18 +539,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || !gid_valid(data.gid)) goto out; - error = -EBADF; - ncp_filp = fget(data.ncp_fd); - if (!ncp_filp) - goto out; - error = -ENOTSOCK; - sock_inode = file_inode(ncp_filp); - if (!S_ISSOCK(sock_inode->i_mode)) - goto out_fput; - sock = SOCKET_I(sock_inode); + sock = sockfd_lookup(data.ncp_fd, &error); if (!sock) - goto out_fput; - + goto out; + if (sock->type == SOCK_STREAM) default_bufsize = 0xF000; else @@ -572,27 +564,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (error) goto out_fput; - server->ncp_filp = ncp_filp; server->ncp_sock = sock; if (data.info_fd != -1) { - struct socket *info_sock; - - error = -EBADF; - server->info_filp = fget(data.info_fd); - if (!server->info_filp) - goto out_bdi; - error = -ENOTSOCK; - sock_inode = file_inode(server->info_filp); - if (!S_ISSOCK(sock_inode->i_mode)) - goto out_fput2; - info_sock = SOCKET_I(sock_inode); + struct socket *info_sock = sockfd_lookup(data.info_fd, &error); if (!info_sock) - goto out_fput2; + goto out_bdi; + server->info_sock = info_sock; error = -EBADFD; if (info_sock->type != SOCK_STREAM) goto out_fput2; - server->info_sock = info_sock; } /* server->lock = 0; */ @@ -620,7 +601,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) now because of PATH_MAX changes.. */ if (server->m.time_out < 1) { server->m.time_out = 10; - printk(KERN_INFO "You need to recompile your ncpfs utils..\n"); + pr_info("You need to recompile your ncpfs utils..\n"); } server->m.time_out = server->m.time_out * HZ / 100; server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG; @@ -681,7 +662,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) ncp_unlock_server(server); if (error < 0) goto out_rxbuf; - DPRINTK("ncp_fill_super: NCP_SBP(sb) = %x\n", (int) NCP_SBP(sb)); + ncp_dbg(1, "NCP_SBP(sb) = %p\n", NCP_SBP(sb)); error = -EMSGSIZE; /* -EREMOTESIDEINCOMPATIBLE */ #ifdef CONFIG_NCPFS_PACKET_SIGNING @@ -709,7 +690,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (ncp_negotiate_buffersize(server, default_bufsize, &(server->buffer_size)) != 0) goto out_disconnect; - DPRINTK("ncpfs: bufsize = %d\n", server->buffer_size); + ncp_dbg(1, "bufsize = %d\n", server->buffer_size); memset(&finfo, 0, sizeof(finfo)); finfo.i.attributes = aDIR; @@ -738,7 +719,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) root_inode = ncp_iget(sb, &finfo); if (!root_inode) goto out_disconnect; - DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber); + ncp_dbg(1, "root vol=%d\n", NCP_FINFO(root_inode)->volNumber); sb->s_root = d_make_root(root_inode); if (!sb->s_root) goto out_disconnect; @@ -764,17 +745,12 @@ out_nls: mutex_destroy(&server->root_setup_lock); mutex_destroy(&server->mutex); out_fput2: - if (server->info_filp) - fput(server->info_filp); + if (server->info_sock) + sockfd_put(server->info_sock); out_bdi: bdi_destroy(&server->bdi); out_fput: - /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: - * - * The previously used put_filp(ncp_filp); was bogus, since - * it doesn't perform proper unlocking. - */ - fput(ncp_filp); + sockfd_put(sock); out: put_pid(data.wdog_pid); sb->s_fs_info = NULL; @@ -782,6 +758,17 @@ out: return error; } +static void delayed_free(struct rcu_head *p) +{ + struct ncp_server *server = container_of(p, struct ncp_server, rcu); +#ifdef CONFIG_NCPFS_NLS + /* unload the NLS charsets */ + unload_nls(server->nls_vol); + unload_nls(server->nls_io); +#endif /* CONFIG_NCPFS_NLS */ + kfree(server); +} + static void ncp_put_super(struct super_block *sb) { struct ncp_server *server = NCP_SBP(sb); @@ -792,18 +779,13 @@ static void ncp_put_super(struct super_block *sb) ncp_stop_tasks(server); -#ifdef CONFIG_NCPFS_NLS - /* unload the NLS charsets */ - unload_nls(server->nls_vol); - unload_nls(server->nls_io); -#endif /* CONFIG_NCPFS_NLS */ mutex_destroy(&server->rcv.creq_mutex); mutex_destroy(&server->root_setup_lock); mutex_destroy(&server->mutex); - if (server->info_filp) - fput(server->info_filp); - fput(server->ncp_filp); + if (server->info_sock) + sockfd_put(server->info_sock); + sockfd_put(server->ncp_sock); kill_pid(server->m.wdog_pid, SIGTERM, 1); put_pid(server->m.wdog_pid); @@ -813,8 +795,7 @@ static void ncp_put_super(struct super_block *sb) vfree(server->rxbuf); vfree(server->txbuf); vfree(server->packet); - sb->s_fs_info = NULL; - kfree(server); + call_rcu(&server->rcu, delayed_free); } static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -979,8 +960,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) != 0) { int written; - DPRINTK("ncpfs: trying to change size to %ld\n", - attr->ia_size); + ncp_dbg(1, "trying to change size to %llu\n", attr->ia_size); if ((result = ncp_make_open(inode, O_WRONLY)) < 0) { result = -EACCES; @@ -1066,7 +1046,7 @@ MODULE_ALIAS_FS("ncpfs"); static int __init init_ncp_fs(void) { int err; - DPRINTK("ncpfs: init_ncp_fs called\n"); + ncp_dbg(1, "called\n"); err = init_inodecache(); if (err) @@ -1083,7 +1063,7 @@ out1: static void __exit exit_ncp_fs(void) { - DPRINTK("ncpfs: exit_ncp_fs called\n"); + ncp_dbg(1, "called\n"); unregister_filesystem(&ncp_fs_type); destroy_inodecache(); } diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 60426ccb3b6..d5659d96ee7 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -41,7 +41,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode, return -EFAULT; if (info.version != NCP_GET_FS_INFO_VERSION) { - DPRINTK("info.version invalid: %d\n", info.version); + ncp_dbg(1, "info.version invalid: %d\n", info.version); return -EINVAL; } /* TODO: info.addr = server->m.serv_addr; */ @@ -66,7 +66,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode, return -EFAULT; if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { - DPRINTK("info.version invalid: %d\n", info2.version); + ncp_dbg(1, "info.version invalid: %d\n", info2.version); return -EINVAL; } info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); @@ -132,7 +132,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode, return -EFAULT; if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { - DPRINTK("info.version invalid: %d\n", info2.version); + ncp_dbg(1, "info.version invalid: %d\n", info2.version); return -EINVAL; } info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); @@ -308,8 +308,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg else result = server->reply_size; ncp_unlock_server(server); - DPRINTK("ncp_ioctl: copy %d bytes\n", - result); + ncp_dbg(1, "copy %d bytes\n", result); if (result >= 0) if (copy_to_user(request.data, bouncebuffer, result)) result = -EFAULT; @@ -385,9 +384,9 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg sr.namespace = server->name_space[sr.volNumber]; result = 0; } else - DPRINTK("ncpfs: s_root->d_inode==NULL\n"); + ncp_dbg(1, "s_root->d_inode==NULL\n"); } else - DPRINTK("ncpfs: s_root==NULL\n"); + ncp_dbg(1, "s_root==NULL\n"); } else { sr.volNumber = -1; sr.namespace = 0; @@ -440,11 +439,11 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg NCP_FINFO(s_inode)->DosDirNum = dosde; server->root_setuped = 1; } else { - DPRINTK("ncpfs: s_root->d_inode==NULL\n"); + ncp_dbg(1, "s_root->d_inode==NULL\n"); result = -EIO; } } else { - DPRINTK("ncpfs: s_root==NULL\n"); + ncp_dbg(1, "s_root==NULL\n"); result = -EIO; } } diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 3c5dd55d284..b359d12eb35 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -107,7 +107,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - DPRINTK("ncp_mmap: called\n"); + ncp_dbg(1, "called\n"); if (!ncp_conn_valid(NCP_SERVER(inode))) return -EIO; diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h index 31831afe1c3..b9f69e1b1f4 100644 --- a/fs/ncpfs/ncp_fs.h +++ b/fs/ncpfs/ncp_fs.h @@ -2,30 +2,32 @@ #include "ncp_fs_i.h" #include "ncp_fs_sb.h" -/* define because it is easy to change PRINTK to {*}PRINTK */ -#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args) - #undef NCPFS_PARANOIA #ifdef NCPFS_PARANOIA -#define PPRINTK(format, args...) PRINTK(format , ## args) +#define ncp_vdbg(fmt, ...) \ + pr_debug(fmt, ##__VA_ARGS__) #else -#define PPRINTK(format, args...) +#define ncp_vdbg(fmt, ...) \ +do { \ + if (0) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) #endif #ifndef DEBUG_NCP #define DEBUG_NCP 0 #endif -#if DEBUG_NCP > 0 -#define DPRINTK(format, args...) PRINTK(format , ## args) -#else -#define DPRINTK(format, args...) -#endif -#if DEBUG_NCP > 1 -#define DDPRINTK(format, args...) PRINTK(format , ## args) -#else -#define DDPRINTK(format, args...) + +#if DEBUG_NCP > 0 && !defined(DEBUG) +#define DEBUG #endif +#define ncp_dbg(level, fmt, ...) \ +do { \ + if (level <= DEBUG_NCP) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) + #define NCP_MAX_RPC_TIMEOUT (6*HZ) diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index c51b2c54353..55e26fd8088 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -38,16 +38,14 @@ struct ncp_mount_data_kernel { }; struct ncp_server { - + struct rcu_head rcu; struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of interest for us later, so we store it completely. */ __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; - struct file *ncp_filp; /* File pointer to ncp socket */ struct socket *ncp_sock;/* ncp socket */ - struct file *info_filp; struct socket *info_sock; u8 sequence; @@ -111,7 +109,7 @@ struct ncp_server { spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ - void (*data_ready)(struct sock* sk, int len); + void (*data_ready)(struct sock* sk); void (*error_report)(struct sock* sk); void (*write_space)(struct sock* sk); /* STREAM mode only */ struct { @@ -153,7 +151,7 @@ extern void ncp_tcp_tx_proc(struct work_struct *work); extern void ncpdgram_rcv_proc(struct work_struct *work); extern void ncpdgram_timeout_proc(struct work_struct *work); extern void ncpdgram_timeout_call(unsigned long server); -extern void ncp_tcp_data_ready(struct sock* sk, int len); +extern void ncp_tcp_data_ready(struct sock* sk); extern void ncp_tcp_write_space(struct sock* sk); extern void ncp_tcp_error_report(struct sock* sk); diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index 981a95617fc..482387532f5 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -9,14 +9,14 @@ * */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "ncp_fs.h" static inline void assert_server_locked(struct ncp_server *server) { if (server->lock == 0) { - DPRINTK("ncpfs: server not locked!\n"); + ncp_dbg(1, "server not locked!\n"); } } @@ -75,7 +75,7 @@ static void ncp_add_pstring(struct ncp_server *server, const char *s) int len = strlen(s); assert_server_locked(server); if (len > 255) { - DPRINTK("ncpfs: string too long: %s\n", s); + ncp_dbg(1, "string too long: %s\n", s); len = 255; } ncp_add_byte(server, len); @@ -225,7 +225,7 @@ int ncp_get_volume_info_with_number(struct ncp_server* server, result = -EIO; len = ncp_reply_byte(server, 29); if (len > NCP_VOLNAME_LEN) { - DPRINTK("ncpfs: volume name too long: %d\n", len); + ncp_dbg(1, "volume name too long: %d\n", len); goto out; } memcpy(&(target->volume_name), ncp_reply_data(server, 30), len); @@ -259,7 +259,7 @@ int ncp_get_directory_info(struct ncp_server* server, __u8 n, result = -EIO; len = ncp_reply_byte(server, 21); if (len > NCP_VOLNAME_LEN) { - DPRINTK("ncpfs: volume name too long: %d\n", len); + ncp_dbg(1, "volume name too long: %d\n", len); goto out; } memcpy(&(target->volume_name), ncp_reply_data(server, 22), len); @@ -295,9 +295,9 @@ ncp_make_closed(struct inode *inode) err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle); if (!err) - PPRINTK("ncp_make_closed: volnum=%d, dirent=%u, error=%d\n", - NCP_FINFO(inode)->volNumber, - NCP_FINFO(inode)->dirEntNum, err); + ncp_vdbg("volnum=%d, dirent=%u, error=%d\n", + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum, err); } mutex_unlock(&NCP_FINFO(inode)->open_mutex); return err; @@ -394,8 +394,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server, if ((result = ncp_request(server, 87)) == 0) { ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs); - DPRINTK(KERN_DEBUG - "ncp_obtain_nfs_info: (%s) mode=0%o, rdev=0x%x\n", + ncp_dbg(1, "(%s) mode=0%o, rdev=0x%x\n", target->entryName, target->nfs.mode, target->nfs.rdev); } else { @@ -425,7 +424,7 @@ int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *pa int result; if (target == NULL) { - printk(KERN_ERR "ncp_obtain_info: invalid call\n"); + pr_err("%s: invalid call\n", __func__); return -EINVAL; } ncp_init_request(server); @@ -498,7 +497,7 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume) namespace = ncp_reply_data(server, 2); while (no_namespaces > 0) { - DPRINTK("get_namespaces: found %d on %d\n", *namespace, volume); + ncp_dbg(1, "found %d on %d\n", *namespace, volume); #ifdef CONFIG_NCPFS_NFS_NS if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS)) @@ -531,8 +530,7 @@ ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns) if (ret_ns) *ret_ns = ns; - DPRINTK("lookup_vol: namespace[%d] = %d\n", - volume, server->name_space[volume]); + ncp_dbg(1, "namespace[%d] = %d\n", volume, server->name_space[volume]); if (server->name_space[volume] == ns) return 0; @@ -596,7 +594,7 @@ ncp_get_volume_root(struct ncp_server *server, { int result; - DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname); + ncp_dbg(1, "looking up vol %s\n", volname); ncp_init_request(server); ncp_add_byte(server, 22); /* Subfunction: Generate dir handle */ diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 3a1587222c8..471bc3d1139 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -8,6 +8,7 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/time.h> #include <linux/errno.h> @@ -96,11 +97,11 @@ static void ncp_req_put(struct ncp_request_reply *req) kfree(req); } -void ncp_tcp_data_ready(struct sock *sk, int len) +void ncp_tcp_data_ready(struct sock *sk) { struct ncp_server *server = sk->sk_user_data; - server->data_ready(sk, len); + server->data_ready(sk); schedule_work(&server->rcv.tq); } @@ -231,7 +232,7 @@ static void __ncptcp_try_send(struct ncp_server *server) return; if (result < 0) { - printk(KERN_ERR "ncpfs: tcp: Send failed: %d\n", result); + pr_err("tcp: Send failed: %d\n", result); __ncp_abort_request(server, rq, result); return; } @@ -332,7 +333,7 @@ static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply * mutex_lock(&server->rcv.creq_mutex); if (!ncp_conn_valid(server)) { mutex_unlock(&server->rcv.creq_mutex); - printk(KERN_ERR "ncpfs: tcp: Server died\n"); + pr_err("tcp: Server died\n"); return -EIO; } ncp_req_get(req); @@ -405,15 +406,15 @@ void ncpdgram_rcv_proc(struct work_struct *work) } result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT); if (result < 0) { - DPRINTK("recv failed with %d\n", result); + ncp_dbg(1, "recv failed with %d\n", result); continue; } if (result < 10) { - DPRINTK("too short (%u) watchdog packet\n", result); + ncp_dbg(1, "too short (%u) watchdog packet\n", result); continue; } if (buf[9] != '?') { - DPRINTK("bad signature (%02X) in watchdog packet\n", buf[9]); + ncp_dbg(1, "bad signature (%02X) in watchdog packet\n", buf[9]); continue; } buf[9] = 'Y'; @@ -448,7 +449,7 @@ void ncpdgram_rcv_proc(struct work_struct *work) result -= 8; hdrl = sock->sk->sk_family == AF_INET ? 8 : 6; if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) { - printk(KERN_INFO "ncpfs: Signature violation\n"); + pr_info("Signature violation\n"); result = -EIO; } } @@ -524,7 +525,7 @@ static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len) return result; } if (result > len) { - printk(KERN_ERR "ncpfs: tcp: bug in recvmsg (%u > %Zu)\n", result, len); + pr_err("tcp: bug in recvmsg (%u > %Zu)\n", result, len); return -EIO; } return result; @@ -552,9 +553,9 @@ static int __ncptcp_rcv_proc(struct ncp_server *server) __ncptcp_abort(server); } if (result < 0) { - printk(KERN_ERR "ncpfs: tcp: error in recvmsg: %d\n", result); + pr_err("tcp: error in recvmsg: %d\n", result); } else { - DPRINTK(KERN_ERR "ncpfs: tcp: EOF\n"); + ncp_dbg(1, "tcp: EOF\n"); } return -EIO; } @@ -566,20 +567,20 @@ static int __ncptcp_rcv_proc(struct ncp_server *server) switch (server->rcv.state) { case 0: if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic)); + pr_err("tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic)); __ncptcp_abort(server); return -EIO; } datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF; if (datalen < 10) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen); + pr_err("tcp: Unexpected reply len %d\n", datalen); __ncptcp_abort(server); return -EIO; } #ifdef CONFIG_NCPFS_PACKET_SIGNING if (server->sign_active) { if (datalen < 18) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen); + pr_err("tcp: Unexpected reply len %d\n", datalen); __ncptcp_abort(server); return -EIO; } @@ -604,7 +605,7 @@ cont:; server->rcv.len = datalen - 10; break; } - DPRINTK("ncpfs: tcp: Unexpected NCP type %02X\n", type); + ncp_dbg(1, "tcp: Unexpected NCP type %02X\n", type); skipdata2:; server->rcv.state = 2; skipdata:; @@ -614,11 +615,11 @@ skipdata:; } req = server->rcv.creq; if (!req) { - DPRINTK(KERN_ERR "ncpfs: Reply without appropriate request\n"); + ncp_dbg(1, "Reply without appropriate request\n"); goto skipdata2; } if (datalen > req->datalen + 8) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8); + pr_err("tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8); server->rcv.state = 3; goto skipdata; } @@ -638,12 +639,12 @@ skipdata:; req = server->rcv.creq; if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) { if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) { - printk(KERN_ERR "ncpfs: tcp: Bad sequence number\n"); + pr_err("tcp: Bad sequence number\n"); __ncp_abort_request(server, req, -EIO); return -EIO; } if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) { - printk(KERN_ERR "ncpfs: tcp: Connection number mismatch\n"); + pr_err("tcp: Connection number mismatch\n"); __ncp_abort_request(server, req, -EIO); return -EIO; } @@ -651,7 +652,7 @@ skipdata:; #ifdef CONFIG_NCPFS_PACKET_SIGNING if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) { if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) { - printk(KERN_ERR "ncpfs: tcp: Signature violation\n"); + pr_err("tcp: Signature violation\n"); __ncp_abort_request(server, req, -EIO); return -EIO; } @@ -742,7 +743,7 @@ static int ncp_do_request(struct ncp_server *server, int size, int result; if (server->lock == 0) { - printk(KERN_ERR "ncpfs: Server not locked!\n"); + pr_err("Server not locked!\n"); return -EIO; } if (!ncp_conn_valid(server)) { @@ -781,7 +782,7 @@ static int ncp_do_request(struct ncp_server *server, int size, spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - DDPRINTK("do_ncp_rpc_call returned %d\n", result); + ncp_dbg(2, "do_ncp_rpc_call returned %d\n", result); return result; } @@ -811,7 +812,7 @@ int ncp_request2(struct ncp_server *server, int function, result = ncp_do_request(server, server->current_size, reply, size); if (result < 0) { - DPRINTK("ncp_request_error: %d\n", result); + ncp_dbg(1, "ncp_request_error: %d\n", result); goto out; } server->completion = reply->completion_code; @@ -822,7 +823,7 @@ int ncp_request2(struct ncp_server *server, int function, result = reply->completion_code; if (result != 0) - PPRINTK("ncp_request: completion code=%x\n", result); + ncp_vdbg("completion code=%x\n", result); out: return result; } @@ -865,14 +866,14 @@ void ncp_lock_server(struct ncp_server *server) { mutex_lock(&server->mutex); if (server->lock) - printk(KERN_WARNING "ncp_lock_server: was locked!\n"); + pr_warn("%s: was locked!\n", __func__); server->lock = 1; } void ncp_unlock_server(struct ncp_server *server) { if (!server->lock) { - printk(KERN_WARNING "ncp_unlock_server: was not locked!\n"); + pr_warn("%s: was not locked!\n", __func__); return; } server->lock = 0; diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index 52439ddc8de..1a63bfdb4a6 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -112,7 +112,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { __le32 attr; unsigned int hdr; - DPRINTK("ncp_symlink(dir=%p,dentry=%p,symname=%s)\n",dir,dentry,symname); + ncp_dbg(1, "dir=%p, dentry=%p, symname=%s\n", dir, dentry, symname); if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) kludge = 0; diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index b5e80b0af31..3dece03f2fc 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -116,17 +116,17 @@ config NFS_V4_2 config PNFS_FILE_LAYOUT tristate depends on NFS_V4_1 - default m + default NFS_V4 config PNFS_BLOCK tristate depends on NFS_V4_1 && BLK_DEV_DM - default m + default NFS_V4 config PNFS_OBJLAYOUT tristate depends on NFS_V4_1 && SCSI_OSD_ULD - default m + default NFS_V4 config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN string "NFSv4.1 Implementation ID Domain" @@ -140,6 +140,17 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN If the NFS client is unchanged from the upstream kernel, this option should be set to the default "kernel.org". +config NFS_V4_1_MIGRATION + bool "NFSv4.1 client support for migration" + depends on NFS_V4_1 + default n + help + This option makes the NFS client advertise to NFSv4.1 servers that + it can support NFSv4 migration. + + The NFSv4.1 pieces of the Linux NFSv4 migration implementation are + still experimental. If you are not an NFSv4 developer, say N here. + config NFS_V4_SECURITY_LABEL bool depends on NFS_V4_2 && SECURITY diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 03192a66c14..4782e0840dc 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -29,8 +29,6 @@ nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o -obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o -nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o - +obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index e242bbf7297..9b431f44fad 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -134,8 +134,8 @@ bl_submit_bio(int rw, struct bio *bio) if (bio) { get_parallel(bio->bi_private); dprintk("%s submitting %s bio %u@%llu\n", __func__, - rw == READ ? "read" : "write", - bio->bi_size, (unsigned long long)bio->bi_sector); + rw == READ ? "read" : "write", bio->bi_iter.bi_size, + (unsigned long long)bio->bi_iter.bi_sector); submit_bio(rw, bio); } return NULL; @@ -156,7 +156,8 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, } if (bio) { - bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_iter.bi_sector = isect - be->be_f_offset + + be->be_v_offset; bio->bi_bdev = be->be_mdev; bio->bi_end_io = end_io; bio->bi_private = par; @@ -201,19 +202,15 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, static void bl_end_io_read(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; + int i; - do { - struct page *page = bvec->bv_page; + if (!err) + bio_for_each_segment_all(bvec, bio, i) + SetPageUptodate(bvec->bv_page); - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (uptodate) - SetPageUptodate(page); - } while (bvec >= bio->bi_io_vec); - if (!uptodate) { - struct nfs_read_data *rdata = par->data; + if (err) { + struct nfs_pgio_data *rdata = par->data; struct nfs_pgio_header *header = rdata->header; if (!header->pnfs_error) @@ -227,17 +224,17 @@ static void bl_end_io_read(struct bio *bio, int err) static void bl_read_cleanup(struct work_struct *work) { struct rpc_task *task; - struct nfs_read_data *rdata; + struct nfs_pgio_data *rdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - rdata = container_of(task, struct nfs_read_data, task); + rdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_read_done(rdata); } static void bl_end_par_io_read(void *data, int unused) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; rdata->task.tk_status = rdata->header->pnfs_error; INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); @@ -245,7 +242,7 @@ bl_end_par_io_read(void *data, int unused) } static enum pnfs_try_status -bl_read_pagelist(struct nfs_read_data *rdata) +bl_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *header = rdata->header; int i, hole; @@ -383,21 +380,17 @@ static void mark_extents_written(struct pnfs_block_layout *bl, static void bl_end_io_write_zero(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - do { - struct page *page = bvec->bv_page; + struct bio_vec *bvec; + int i; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); + bio_for_each_segment_all(bvec, bio, i) { /* This is the zeroing page we added */ - end_page_writeback(page); - page_cache_release(page); - } while (bvec >= bio->bi_io_vec); + end_page_writeback(bvec->bv_page); + page_cache_release(bvec->bv_page); + } - if (unlikely(!uptodate)) { - struct nfs_write_data *data = par->data; + if (unlikely(err)) { + struct nfs_pgio_data *data = par->data; struct nfs_pgio_header *header = data->header; if (!header->pnfs_error) @@ -412,7 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct nfs_write_data *data = par->data; + struct nfs_pgio_data *data = par->data; struct nfs_pgio_header *header = data->header; if (!uptodate) { @@ -430,10 +423,10 @@ static void bl_end_io_write(struct bio *bio, int err) static void bl_write_cleanup(struct work_struct *work) { struct rpc_task *task; - struct nfs_write_data *wdata; + struct nfs_pgio_data *wdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - wdata = container_of(task, struct nfs_write_data, task); + wdata = container_of(task, struct nfs_pgio_data, task); if (likely(!wdata->header->pnfs_error)) { /* Marks for LAYOUTCOMMIT */ mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg), @@ -445,7 +438,7 @@ static void bl_write_cleanup(struct work_struct *work) /* Called when last of bios associated with a bl_write_pagelist call finishes */ static void bl_end_par_io_write(void *data, int num_se) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (unlikely(wdata->header->pnfs_error)) { bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval, @@ -519,7 +512,7 @@ bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be, isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) + (offset / SECTOR_SIZE); - bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset; bio->bi_bdev = be->be_mdev; bio->bi_end_io = bl_read_single_end_io; @@ -680,7 +673,7 @@ check_page: } static enum pnfs_try_status -bl_write_pagelist(struct nfs_write_data *wdata, int sync) +bl_write_pagelist(struct nfs_pgio_data *wdata, int sync) { struct nfs_pgio_header *header = wdata->header; int i, ret, npg_zero, pg_index, last = 0; @@ -1196,13 +1189,17 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) pnfs_generic_pg_init_read(pgio, req); } -static bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { if (pgio->pg_dreq != NULL && !is_aligned_req(req, SECTOR_SIZE)) - return false; + return 0; return pnfs_generic_pg_test(pgio, prev, req); } @@ -1220,7 +1217,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); if (end != NFS_I(inode)->npages) { rcu_read_lock(); - end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX); + end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); rcu_read_unlock(); } @@ -1248,13 +1245,17 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } } -static bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { if (pgio->pg_dreq != NULL && !is_aligned_req(req, PAGE_CACHE_SIZE)) - return false; + return 0; return pnfs_generic_pg_test(pgio, prev, req); } diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 8485978993e..9838fb02047 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -36,6 +36,7 @@ #include <linux/nfs_fs.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include "../nfs4_fs.h" #include "../pnfs.h" #include "../netns.h" diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 9c3e117c3ed..4d016144256 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -44,7 +44,7 @@ static inline sector_t normalize(sector_t s, int base) { sector_t tmp = s; /* Since do_div modifies its argument */ - return s - do_div(tmp, base); + return s - sector_div(tmp, base); } static inline sector_t normalize_up(sector_t s, int base) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 67cd7321316..073b4cf67ed 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -164,8 +164,7 @@ nfs41_callback_up(struct svc_serv *serv) svc_xprt_put(serv->sv_bc_xprt); serv->sv_bc_xprt = NULL; } - dprintk("--> %s return %ld\n", __func__, - IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); + dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); return rqstp; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index ae2e87b9545..41db5258e7a 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -112,7 +112,8 @@ out: * TODO: keep track of all layouts (and delegations) in a hash table * hashed by filehandle. */ -static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh) +static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, + struct nfs_fh *fh, nfs4_stateid *stateid) { struct nfs_server *server; struct inode *ino; @@ -120,17 +121,19 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) + continue; if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) continue; ino = igrab(lo->plh_inode); if (!ino) - continue; + break; spin_lock(&ino->i_lock); /* Is this layout in the process of being freed? */ if (NFS_I(ino)->layout != lo) { spin_unlock(&ino->i_lock); iput(ino); - continue; + break; } pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); @@ -141,13 +144,14 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, return NULL; } -static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh) +static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, + struct nfs_fh *fh, nfs4_stateid *stateid) { struct pnfs_layout_hdr *lo; spin_lock(&clp->cl_lock); rcu_read_lock(); - lo = get_layout_by_fh_locked(clp, fh); + lo = get_layout_by_fh_locked(clp, fh, stateid); rcu_read_unlock(); spin_unlock(&clp->cl_lock); @@ -162,9 +166,9 @@ static u32 initiate_file_draining(struct nfs_client *clp, u32 rv = NFS4ERR_NOMATCHING_LAYOUT; LIST_HEAD(free_me_list); - lo = get_layout_by_fh(clp, &args->cbl_fh); + lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); if (!lo) - return NFS4ERR_NOMATCHING_LAYOUT; + goto out; ino = lo->plh_inode; spin_lock(&ino->i_lock); @@ -179,6 +183,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, pnfs_free_lseg_list(&free_me_list); pnfs_put_layout_hdr(lo); iput(ino); +out: return rv; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 2dceee4db07..1d09289c8f0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -590,6 +590,8 @@ int nfs_create_rpc_client(struct nfs_client *clp, if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_DISCRTRY; + if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT; if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) @@ -784,8 +786,10 @@ static int nfs_init_server(struct nfs_server *server, goto error; server->port = data->nfs_server.port; + server->auth_info = data->auth_info; - error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + error = nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); if (error < 0) goto error; @@ -926,6 +930,7 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour target->acdirmax = source->acdirmax; target->caps = source->caps; target->options = source->options; + target->auth_info = source->auth_info; } EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); @@ -943,7 +948,7 @@ void nfs_server_insert_lists(struct nfs_server *server) } EXPORT_SYMBOL_GPL(nfs_server_insert_lists); -static void nfs_server_remove_lists(struct nfs_server *server) +void nfs_server_remove_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn; @@ -960,6 +965,7 @@ static void nfs_server_remove_lists(struct nfs_server *server) synchronize_rcu(); } +EXPORT_SYMBOL_GPL(nfs_server_remove_lists); /* * Allocate and initialise a server record diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index ef792f29f83..5d8ccecf5f5 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -659,16 +659,19 @@ int nfs_async_inode_return_delegation(struct inode *inode, rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation == NULL) + goto out_enoent; - if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) { - rcu_read_unlock(); - return -ENOENT; - } + if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) + goto out_enoent; nfs_mark_return_delegation(server, delegation); rcu_read_unlock(); nfs_delegation_run_state_manager(clp); return 0; +out_enoent: + rcu_read_unlock(); + return -ENOENT; } static struct inode * diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8b3dd7174fa..4a3d4ef7612 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -69,21 +69,28 @@ const struct address_space_operations nfs_dir_aops = { static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) { + struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { ctx->duped = 0; - ctx->attr_gencount = NFS_I(dir)->attr_gencount; + ctx->attr_gencount = nfsi->attr_gencount; ctx->dir_cookie = 0; ctx->dup_cookie = 0; ctx->cred = get_rpccred(cred); + spin_lock(&dir->i_lock); + list_add(&ctx->list, &nfsi->open_files); + spin_unlock(&dir->i_lock); return ctx; } return ERR_PTR(-ENOMEM); } -static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) +static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) { + spin_lock(&dir->i_lock); + list_del(&ctx->list); + spin_unlock(&dir->i_lock); put_rpccred(ctx->cred); kfree(ctx); } @@ -126,7 +133,7 @@ out: static int nfs_closedir(struct inode *inode, struct file *filp) { - put_nfs_open_dir_context(filp->private_data); + put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data); return 0; } @@ -274,6 +281,15 @@ out_eof: return -EBADCOOKIE; } +static bool +nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) +{ + if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) + return false; + smp_rmb(); + return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); +} + static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) { @@ -287,8 +303,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des struct nfs_open_dir_context *ctx = desc->file->private_data; new_pos = desc->current_index + i; - if (ctx->attr_gencount != nfsi->attr_gencount - || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { + if (ctx->attr_gencount != nfsi->attr_gencount || + !nfs_readdir_inode_mapping_valid(nfsi)) { ctx->duped = 0; ctx->attr_gencount = nfsi->attr_gencount; } else if (new_pos < desc->ctx->pos) { @@ -297,10 +313,9 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des if (printk_ratelimit()) { pr_notice("NFS: directory %pD2 contains a readdir loop." "Please contact your server vendor. " - "The file: %s has duplicate cookie %llu\n", - desc->file, - array->array[i].string.name, - *desc->dir_cookie); + "The file: %.*s has duplicate cookie %llu\n", + desc->file, array->array[i].string.len, + array->array[i].string.name, *desc->dir_cookie); } status = -ELOOP; goto out; @@ -428,6 +443,22 @@ void nfs_advise_use_readdirplus(struct inode *dir) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); } +/* + * This function is mainly for use by nfs_getattr(). + * + * If this is an 'ls -l', we want to force use of readdirplus. + * Do this by checking if there is an active file descriptor + * and calling nfs_advise_use_readdirplus, then forcing a + * cache flush. + */ +void nfs_force_use_readdirplus(struct inode *dir) +{ + if (!list_empty(&NFS_I(dir)->open_files)) { + nfs_advise_use_readdirplus(dir); + nfs_zap_mapping(dir, dir->i_mapping); + } +} + static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) { @@ -806,6 +837,17 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) goto out; } +static bool nfs_dir_mapping_need_revalidate(struct inode *dir) +{ + struct nfs_inode *nfsi = NFS_I(dir); + + if (nfs_attribute_cache_expired(dir)) + return true; + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return true; + return false; +} + /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. @@ -838,7 +880,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; nfs_block_sillyrename(dentry); - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) + if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) goto out; @@ -1125,7 +1167,13 @@ out_zap_parent: if (inode && S_ISDIR(inode->i_mode)) { /* Purge readdir caches. */ nfs_zap_caches(inode); - if (dentry->d_flags & DCACHE_DISCONNECTED) + /* + * We can't d_drop the root of a disconnected tree: + * its d_hash is on the s_anon list and d_drop() would hide + * it from shrink_dcache_for_unmount(), leading to busy + * inodes on unmount and further oopses. + */ + if (IS_ROOT(dentry)) goto out_valid; } /* If we have submounts, don't unhash ! */ @@ -1361,7 +1409,7 @@ static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, i static int do_open(struct inode *inode, struct file *filp) { - nfs_fscache_set_inode_cookie(inode, filp); + nfs_fscache_open_file(inode, filp); return 0; } @@ -1398,7 +1446,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, /* Expect a negative dentry */ BUG_ON(dentry->d_inode); - dfprintk(VFS, "NFS: atomic_open(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); err = nfs_check_flags(open_flags); @@ -1588,7 +1636,7 @@ int nfs_create(struct inode *dir, struct dentry *dentry, int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; int error; - dfprintk(VFS, "NFS: create(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; @@ -1615,7 +1663,7 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) struct iattr attr; int status; - dfprintk(VFS, "NFS: mknod(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); if (!new_valid_dev(rdev)) @@ -1644,7 +1692,7 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct iattr attr; int error; - dfprintk(VFS, "NFS: mkdir(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_valid = ATTR_MODE; @@ -1672,7 +1720,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) { int error; - dfprintk(VFS, "NFS: rmdir(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_rmdir_enter(dir, dentry); @@ -1741,7 +1789,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) int error; int need_rehash = 0; - dfprintk(VFS, "NFS: unlink(%s/%ld, %pd)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_unlink_enter(dir, dentry); @@ -1792,7 +1840,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) unsigned int pathlen = strlen(symname); int error; - dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry, symname); if (pathlen > PAGE_SIZE) @@ -1815,7 +1863,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { - dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s) error %d\n", + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, dentry, symname, error); d_drop(dentry); @@ -1831,6 +1879,11 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) GFP_KERNEL)) { SetPageUptodate(page); unlock_page(page); + /* + * add_to_page_cache_lru() grabs an extra page refcount. + * Drop it here to avoid leaking this page later. + */ + page_cache_release(page); } else __free_page(page); @@ -1891,6 +1944,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct dentry *dentry = NULL, *rehash = NULL; + struct rpc_task *task; int error = -EBUSY; dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n", @@ -1938,8 +1992,16 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode != NULL) NFS_PROTO(new_inode)->return_delegation(new_inode); - error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); + if (IS_ERR(task)) { + error = PTR_ERR(task); + goto out; + } + + error = rpc_wait_for_completion_task(task); + if (error == 0) + error = task->tk_status; + rpc_put_task(task); nfs_mark_for_revalidate(old_inode); out: if (rehash) @@ -1970,9 +2032,9 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry) { put_rpccred(entry->cred); kfree(entry); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static void nfs_access_free_list(struct list_head *head) @@ -2020,9 +2082,9 @@ nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) else { remove_lru_entry: list_del_init(&nfsi->access_cache_inode_lru); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } spin_unlock(&inode->i_lock); } @@ -2170,9 +2232,9 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) nfs_access_add_rbtree(inode, cache); /* Update accounting */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_long_inc(&nfs_access_nr_entries); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* Add inode to global LRU list */ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { @@ -2298,7 +2360,7 @@ out: if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) res = -EACCES; - dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", + dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); return res; out_notsup: diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index d71d66c9e0a..f11b9eed0de 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -108,6 +108,97 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } +/* + * nfs_direct_select_verf - select the right verifier + * @dreq - direct request possibly spanning multiple servers + * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs + * @ds_idx - index of data server in data server list, only valid if ds_clp set + * + * returns the correct verifier to use given the role of the server + */ +static struct nfs_writeverf * +nfs_direct_select_verf(struct nfs_direct_req *dreq, + struct nfs_client *ds_clp, + int ds_idx) +{ + struct nfs_writeverf *verfp = &dreq->verf; + +#ifdef CONFIG_NFS_V4_1 + if (ds_clp) { + /* pNFS is in use, use the DS verf */ + if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) + verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; + else + WARN_ON_ONCE(1); + } +#endif + return verfp; +} + + +/* + * nfs_direct_set_hdr_verf - set the write/commit verifier + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verfs + * + * Set the server's (MDS or DS) "seen" verifier + */ +static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, + hdr->data->ds_idx); + WARN_ON_ONCE(verfp->committed >= 0); + memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); + WARN_ON_ONCE(verfp->committed < 0); +} + +/* + * nfs_direct_cmp_hdr_verf - compare verifier for pgio header + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verf + * + * set the server's "seen" verf if not initialized. + * returns result of comparison between @hdr->verf and the "seen" + * verf of the server used by @hdr (DS or MDS) + */ +static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, + hdr->data->ds_idx); + if (verfp->committed < 0) { + nfs_direct_set_hdr_verf(dreq, hdr); + return 0; + } + return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); +} + +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +/* + * nfs_direct_cmp_commit_data_verf - compare verifier for commit data + * @dreq - direct request possibly spanning multiple servers + * @data - commit data to validate against previously seen verf + * + * returns result of comparison between @data->verf and the verf of + * the server used by @data (DS or MDS) + */ +static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, + struct nfs_commit_data *data) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, data->ds_clp, + data->ds_commit_index); + WARN_ON_ONCE(verfp->committed < 0); + return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); +} +#endif + /** * nfs_direct_IO - NFS address space operation for direct I/O * @rw: direction (read or write) @@ -121,20 +212,20 @@ static inline int put_dreq(struct nfs_direct_req *dreq) * shunt off direct read and write requests before the VFS gets them, * so this method is only ever called for swap. */ -ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) +ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { #ifndef CONFIG_NFS_SWAP dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", - iocb->ki_filp, (long long) pos, nr_segs); + iocb->ki_filp, (long long) pos, iter->nr_segs); return -EINVAL; #else VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); if (rw == READ || rw == KERNEL_READ) - return nfs_file_direct_read(iocb, iov, nr_segs, pos, + return nfs_file_direct_read(iocb, iter, pos, rw == READ ? true : false); - return nfs_file_direct_write(iocb, iov, nr_segs, pos, + return nfs_file_direct_write(iocb, iter, pos, rw == WRITE ? true : false); #endif /* CONFIG_NFS_SWAP */ } @@ -168,6 +259,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) kref_get(&dreq->kref); init_completion(&dreq->completion); INIT_LIST_HEAD(&dreq->mds_cinfo.list); + dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); spin_lock_init(&dreq->lock); @@ -222,14 +314,31 @@ out: * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust * the iocb is still valid here if this is a synchronous request. */ -static void nfs_direct_complete(struct nfs_direct_req *dreq) +static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) { + struct inode *inode = dreq->inode; + + if (dreq->iocb && write) { + loff_t pos = dreq->iocb->ki_pos + dreq->count; + + spin_lock(&inode->i_lock); + if (i_size_read(inode) < pos) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + + if (write) + nfs_zap_mapping(inode, inode->i_mapping); + + inode_dio_done(inode); + if (dreq->iocb) { long res = (long) dreq->error; if (!res) res = (long) dreq->count; aio_complete(dreq->iocb, res, 0); } + complete_all(&dreq->completion); nfs_direct_req_release(dreq); @@ -237,9 +346,9 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) static void nfs_direct_readpage_release(struct nfs_page *req) { - dprintk("NFS: direct read done (%s/%lld %d@%lld)\n", + dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); @@ -272,7 +381,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) } out_put: if (put_dreq(dreq)) - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, false); hdr->release(hdr); } @@ -305,66 +414,42 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { * handled automatically by nfs_direct_read_result(). Otherwise, if * no requests have been sent, just return an error. */ -static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc, - const struct iovec *iov, - loff_t pos, bool uio) -{ - struct nfs_direct_req *dreq = desc->pg_dreq; - struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->dentry->d_inode; - unsigned long user_addr = (unsigned long)iov->iov_base; - size_t count = iov->iov_len; - size_t rsize = NFS_SERVER(inode)->rsize; - unsigned int pgbase; - int result; - ssize_t started = 0; - struct page **pagevec = NULL; - unsigned int npages; - - do { - size_t bytes; - int i; - pgbase = user_addr & ~PAGE_MASK; - bytes = min(max_t(size_t, rsize, PAGE_SIZE), count); +static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, + struct iov_iter *iter, + loff_t pos) +{ + struct nfs_pageio_descriptor desc; + struct inode *inode = dreq->inode; + ssize_t result = -EINVAL; + size_t requested_bytes = 0; + size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE); - result = -ENOMEM; - npages = nfs_page_array_len(pgbase, bytes); - if (!pagevec) - pagevec = kmalloc(npages * sizeof(struct page *), - GFP_KERNEL); - if (!pagevec) - break; - if (uio) { - down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, - npages, 1, 0, pagevec, NULL); - up_read(¤t->mm->mmap_sem); - if (result < 0) - break; - } else { - WARN_ON(npages != 1); - result = get_kernel_page(user_addr, 1, pagevec); - if (WARN_ON(result != 1)) - break; - } + nfs_pageio_init_read(&desc, dreq->inode, false, + &nfs_direct_read_completion_ops); + get_dreq(dreq); + desc.pg_dreq = dreq; + atomic_inc(&inode->i_dio_count); - if ((unsigned)result < npages) { - bytes = result * PAGE_SIZE; - if (bytes <= pgbase) { - nfs_direct_release_pages(pagevec, result); - break; - } - bytes -= pgbase; - npages = result; - } + while (iov_iter_count(iter)) { + struct page **pagevec; + size_t bytes; + size_t pgbase; + unsigned npages, i; + result = iov_iter_get_pages_alloc(iter, &pagevec, + rsize, &pgbase); + if (result < 0) + break; + + bytes = result; + iov_iter_advance(iter, bytes); + npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; for (i = 0; i < npages; i++) { struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); /* XXX do we need to do the eof zeroing found in async_filler? */ - req = nfs_create_request(dreq->ctx, dreq->inode, - pagevec[i], + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, pgbase, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); @@ -372,54 +457,21 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de } req->wb_index = pos >> PAGE_SHIFT; req->wb_offset = pos & ~PAGE_MASK; - if (!nfs_pageio_add_request(desc, req)) { - result = desc->pg_error; + if (!nfs_pageio_add_request(&desc, req)) { + result = desc.pg_error; nfs_release_request(req); break; } pgbase = 0; bytes -= req_len; - started += req_len; - user_addr += req_len; + requested_bytes += req_len; pos += req_len; - count -= req_len; dreq->bytes_left -= req_len; } - /* The nfs_page now hold references to these pages */ nfs_direct_release_pages(pagevec, npages); - } while (count != 0 && result >= 0); - - kfree(pagevec); - - if (started) - return started; - return result < 0 ? (ssize_t) result : -EFAULT; -} - -static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos, bool uio) -{ - struct nfs_pageio_descriptor desc; - ssize_t result = -EINVAL; - size_t requested_bytes = 0; - unsigned long seg; - - NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode, - &nfs_direct_read_completion_ops); - get_dreq(dreq); - desc.pg_dreq = dreq; - - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *vec = &iov[seg]; - result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio); + kvfree(pagevec); if (result < 0) break; - requested_bytes += result; - if ((size_t)result < vec->iov_len) - break; - pos += vec->iov_len; } nfs_pageio_complete(&desc); @@ -429,29 +481,69 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * generic layer handle the completion. */ if (requested_bytes == 0) { + inode_dio_done(inode); nfs_direct_req_release(dreq); return result < 0 ? result : -EIO; } if (put_dreq(dreq)) - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, false); return 0; } -static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @iter: vector of user buffers into which to read data + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file. For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change. Our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. + * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. + */ +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos, bool uio) { - ssize_t result = -ENOMEM; - struct inode *inode = iocb->ki_filp->f_mapping->host; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; + ssize_t result = -EINVAL; + size_t count = iov_iter_count(iter); + nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + + dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", + file, count, (long long) pos); + + result = 0; + if (!count) + goto out; + mutex_lock(&inode->i_mutex); + result = nfs_sync_mapping(mapping); + if (result) + goto out_unlock; + + task_io_account_read(count); + + result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) - goto out; + goto out_unlock; dreq->inode = inode; - dreq->bytes_left = iov_length(iov, nr_segs); + dreq->bytes_left = count; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { @@ -462,22 +554,28 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - NFS_I(inode)->read_io += iov_length(iov, nr_segs); - result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); - if (!result) + NFS_I(inode)->read_io += count; + result = nfs_direct_read_schedule_iovec(dreq, iter, pos); + + mutex_unlock(&inode->i_mutex); + + if (!result) { result = nfs_direct_wait(dreq); + if (result > 0) + iocb->ki_pos = pos + result; + } + + nfs_direct_req_release(dreq); + return result; + out_release: nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); out: return result; } -static void nfs_inode_dio_write_done(struct inode *inode) -{ - nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_done(inode); -} - #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { @@ -496,7 +594,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) dreq->count = 0; get_dreq(dreq); - NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE, + nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; @@ -535,7 +633,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) dprintk("NFS: %5u commit failed with error %d.\n", data->task.tk_pid, status); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { + } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) { dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } @@ -593,8 +691,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: - nfs_inode_dio_write_done(dreq->inode); - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, true); } } @@ -610,114 +707,10 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) { - nfs_inode_dio_write_done(inode); - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, true); } #endif -/* - * NB: Return the value of the first error return code. Subsequent - * errors after the first one are ignored. - */ -/* - * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE - * operation. If nfs_writedata_alloc() or get_user_pages() fails, - * bail and stop sending more writes. Write length accounting is - * handled automatically by nfs_direct_write_result(). Otherwise, if - * no requests have been sent, just return an error. - */ -static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc, - const struct iovec *iov, - loff_t pos, bool uio) -{ - struct nfs_direct_req *dreq = desc->pg_dreq; - struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->dentry->d_inode; - unsigned long user_addr = (unsigned long)iov->iov_base; - size_t count = iov->iov_len; - size_t wsize = NFS_SERVER(inode)->wsize; - unsigned int pgbase; - int result; - ssize_t started = 0; - struct page **pagevec = NULL; - unsigned int npages; - - do { - size_t bytes; - int i; - - pgbase = user_addr & ~PAGE_MASK; - bytes = min(max_t(size_t, wsize, PAGE_SIZE), count); - - result = -ENOMEM; - npages = nfs_page_array_len(pgbase, bytes); - if (!pagevec) - pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL); - if (!pagevec) - break; - - if (uio) { - down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, - npages, 0, 0, pagevec, NULL); - up_read(¤t->mm->mmap_sem); - if (result < 0) - break; - } else { - WARN_ON(npages != 1); - result = get_kernel_page(user_addr, 0, pagevec); - if (WARN_ON(result != 1)) - break; - } - - if ((unsigned)result < npages) { - bytes = result * PAGE_SIZE; - if (bytes <= pgbase) { - nfs_direct_release_pages(pagevec, result); - break; - } - bytes -= pgbase; - npages = result; - } - - for (i = 0; i < npages; i++) { - struct nfs_page *req; - unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - - req = nfs_create_request(dreq->ctx, dreq->inode, - pagevec[i], - pgbase, req_len); - if (IS_ERR(req)) { - result = PTR_ERR(req); - break; - } - nfs_lock_request(req); - req->wb_index = pos >> PAGE_SHIFT; - req->wb_offset = pos & ~PAGE_MASK; - if (!nfs_pageio_add_request(desc, req)) { - result = desc->pg_error; - nfs_unlock_and_release_request(req); - break; - } - pgbase = 0; - bytes -= req_len; - started += req_len; - user_addr += req_len; - pos += req_len; - count -= req_len; - dreq->bytes_left -= req_len; - } - /* The nfs_page now hold references to these pages */ - nfs_direct_release_pages(pagevec, npages); - } while (count != 0 && result >= 0); - - kfree(pagevec); - - if (started) - return started; - return result < 0 ? (ssize_t) result : -EFAULT; -} - static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; @@ -747,13 +740,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) bit = NFS_IOHDR_NEED_RESCHED; else if (dreq->flags == 0) { - memcpy(&dreq->verf, hdr->verf, - sizeof(dreq->verf)); + nfs_direct_set_hdr_verf(dreq, hdr); bit = NFS_IOHDR_NEED_COMMIT; dreq->flags = NFS_ODIRECT_DO_COMMIT; } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) { - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) { + dreq->flags = + NFS_ODIRECT_RESCHED_WRITES; bit = NFS_IOHDR_NEED_RESCHED; } else bit = NFS_IOHDR_NEED_COMMIT; @@ -763,6 +756,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { + req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); switch (bit) { @@ -797,33 +791,77 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { .completion = nfs_direct_write_completion, }; + +/* + * NB: Return the value of the first error return code. Subsequent + * errors after the first one are ignored. + */ +/* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. If nfs_writedata_alloc() or get_user_pages() fails, + * bail and stop sending more writes. Write length accounting is + * handled automatically by nfs_direct_write_result(). Otherwise, if + * no requests have been sent, just return an error. + */ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos, bool uio) + struct iov_iter *iter, + loff_t pos) { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; ssize_t result = 0; size_t requested_bytes = 0; - unsigned long seg; + size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); - NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE, + nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); atomic_inc(&inode->i_dio_count); - NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs); - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *vec = &iov[seg]; - result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); + NFS_I(inode)->write_io += iov_iter_count(iter); + while (iov_iter_count(iter)) { + struct page **pagevec; + size_t bytes; + size_t pgbase; + unsigned npages, i; + + result = iov_iter_get_pages_alloc(iter, &pagevec, + wsize, &pgbase); if (result < 0) break; - requested_bytes += result; - if ((size_t)result < vec->iov_len) + + bytes = result; + iov_iter_advance(iter, bytes); + npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; + for (i = 0; i < npages; i++) { + struct nfs_page *req; + unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); + + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, + pgbase, req_len); + if (IS_ERR(req)) { + result = PTR_ERR(req); + break; + } + nfs_lock_request(req); + req->wb_index = pos >> PAGE_SHIFT; + req->wb_offset = pos & ~PAGE_MASK; + if (!nfs_pageio_add_request(&desc, req)) { + result = desc.pg_error; + nfs_unlock_and_release_request(req); + break; + } + pgbase = 0; + bytes -= req_len; + requested_bytes += req_len; + pos += req_len; + dreq->bytes_left -= req_len; + } + nfs_direct_release_pages(pagevec, npages); + kvfree(pagevec); + if (result < 0) break; - pos += vec->iov_len; } nfs_pageio_complete(&desc); @@ -842,98 +880,10 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, return 0; } -static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - size_t count, bool uio) -{ - ssize_t result = -ENOMEM; - struct inode *inode = iocb->ki_filp->f_mapping->host; - struct nfs_direct_req *dreq; - struct nfs_lock_context *l_ctx; - - dreq = nfs_direct_req_alloc(); - if (!dreq) - goto out; - - dreq->inode = inode; - dreq->bytes_left = count; - dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); - l_ctx = nfs_get_lock_context(dreq->ctx); - if (IS_ERR(l_ctx)) { - result = PTR_ERR(l_ctx); - goto out_release; - } - dreq->l_ctx = l_ctx; - if (!is_sync_kiocb(iocb)) - dreq->iocb = iocb; - - result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); - if (!result) - result = nfs_direct_wait(dreq); -out_release: - nfs_direct_req_release(dreq); -out: - return result; -} - -/** - * nfs_file_direct_read - file direct read operation for NFS files - * @iocb: target I/O control block - * @iov: vector of user buffers into which to read data - * @nr_segs: size of iov vector - * @pos: byte offset in file where reading starts - * - * We use this function for direct reads instead of calling - * generic_file_aio_read() in order to avoid gfar's check to see if - * the request starts before the end of the file. For that check - * to work, we must generate a GETATTR before each direct read, and - * even then there is a window between the GETATTR and the subsequent - * READ where the file size could change. Our preference is simply - * to do all reads the application wants, and the server will take - * care of managing the end of file boundary. - * - * This function also eliminates unnecessarily updating the file's - * atime locally, as the NFS server sets the file's atime, and this - * client must read the updated atime from the server back into its - * cache. - */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) -{ - ssize_t retval = -EINVAL; - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - size_t count; - - count = iov_length(iov, nr_segs); - nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); - - dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", - file, count, (long long) pos); - - retval = 0; - if (!count) - goto out; - - retval = nfs_sync_mapping(mapping); - if (retval) - goto out; - - task_io_account_read(count); - - retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio); - if (retval > 0) - iocb->ki_pos = pos + retval; - -out: - return retval; -} - /** * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block - * @iov: vector of user buffers from which to write data - * @nr_segs: size of iov vector + * @iter: vector of user buffers from which to write data * @pos: byte offset in file where writing starts * * We use this function for direct writes instead of calling @@ -951,49 +901,97 @@ out: * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos, bool uio) { - ssize_t retval = -EINVAL; + ssize_t result = -EINVAL; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - size_t count; + struct inode *inode = mapping->host; + struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; + loff_t end; + size_t count = iov_iter_count(iter); + end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - count = iov_length(iov, nr_segs); nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, count, (long long) pos); - retval = generic_write_checks(file, &pos, &count, 0); - if (retval) + result = generic_write_checks(file, &pos, &count, 0); + if (result) goto out; - retval = -EINVAL; + result = -EINVAL; if ((ssize_t) count < 0) goto out; - retval = 0; + result = 0; if (!count) goto out; - retval = nfs_sync_mapping(mapping); - if (retval) - goto out; + mutex_lock(&inode->i_mutex); + + result = nfs_sync_mapping(mapping); + if (result) + goto out_unlock; + + if (mapping->nrpages) { + result = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + if (result) + goto out_unlock; + } task_io_account_write(count); - retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio); - if (retval > 0) { - struct inode *inode = mapping->host; + result = -ENOMEM; + dreq = nfs_direct_req_alloc(); + if (!dreq) + goto out_unlock; - iocb->ki_pos = pos + retval; - spin_lock(&inode->i_lock); - if (i_size_read(inode) < iocb->ki_pos) - i_size_write(inode, iocb->ki_pos); - spin_unlock(&inode->i_lock); + dreq->inode = inode; + dreq->bytes_left = count; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); + goto out_release; + } + dreq->l_ctx = l_ctx; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + result = nfs_direct_write_schedule_iovec(dreq, iter, pos); + + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); } + + mutex_unlock(&inode->i_mutex); + + if (!result) { + result = nfs_direct_wait(dreq); + if (result > 0) { + struct inode *inode = mapping->host; + + iocb->ki_pos = pos + result; + spin_lock(&inode->i_lock); + if (i_size_read(inode) < iocb->ki_pos) + i_size_write(inode, iocb->ki_pos); + spin_unlock(&inode->i_lock); + } + } + nfs_direct_req_release(dreq); + return result; + +out_release: + nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); out: - return retval; + return result; } /** diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index fc0f95ec735..d25f10fb492 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -46,7 +46,9 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, #include <linux/sunrpc/cache.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/nfs_fs.h> +#include "nfs4_fs.h" #include "dns_resolve.h" #include "cache_lib.h" #include "netns.h" diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e2fcacf07de..4042ff58fe3 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -165,22 +165,21 @@ nfs_file_flush(struct file *file, fl_owner_t id) EXPORT_SYMBOL_GPL(nfs_file_flush); ssize_t -nfs_file_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +nfs_file_read(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t result; if (iocb->ki_filp->f_flags & O_DIRECT) - return nfs_file_direct_read(iocb, iov, nr_segs, pos, true); + return nfs_file_direct_read(iocb, to, iocb->ki_pos, true); - dprintk("NFS: read(%pD2, %lu@%lu)\n", + dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, - (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); + iov_iter_count(to), (unsigned long) iocb->ki_pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { - result = generic_file_aio_read(iocb, iov, nr_segs, pos); + result = generic_file_read_iter(iocb, to); if (result > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); } @@ -354,7 +353,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page *page; int once_thru = 0; - dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%ld), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); start: @@ -395,7 +394,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, struct nfs_open_context *ctx = nfs_file_open_context(file); int status; - dfprintk(PAGECACHE, "NFS: write_end(%pD2(%ld), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); /* @@ -585,7 +584,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) int ret = VM_FAULT_NOPAGE; struct address_space *mapping; - dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%ld), offset %lld)\n", + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", filp, filp->f_mapping->host->i_ino, (long long)page_offset(page)); @@ -617,6 +616,7 @@ out: static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -634,24 +634,24 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode) return 0; } -ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); unsigned long written = 0; ssize_t result; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(from); + loff_t pos = iocb->ki_pos; result = nfs_key_timeout_notify(file, inode); if (result) return result; if (file->f_flags & O_DIRECT) - return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); + return nfs_file_direct_write(iocb, from, pos, true); - dprintk("NFS: write(%pD2, %lu@%Ld)\n", - file, (unsigned long) count, (long long) pos); + dprintk("NFS: write(%pD2, %zu@%Ld)\n", + file, count, (long long) pos); result = -EBUSY; if (IS_SWAPFILE(inode)) @@ -669,7 +669,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; - result = generic_file_aio_write(iocb, iov, nr_segs, pos); + result = generic_file_write_iter(iocb, from); if (result > 0) written = result; @@ -690,36 +690,6 @@ out_swapfile: } EXPORT_SYMBOL_GPL(nfs_file_write); -ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, - struct file *filp, loff_t *ppos, - size_t count, unsigned int flags) -{ - struct inode *inode = file_inode(filp); - unsigned long written = 0; - ssize_t ret; - - dprintk("NFS splice_write(%pD2, %lu@%llu)\n", - filp, (unsigned long) count, (unsigned long long) *ppos); - - /* - * The combination of splice and an O_APPEND destination is disallowed. - */ - - ret = generic_file_splice_write(pipe, filp, ppos, count, flags); - if (ret > 0) - written = ret; - - if (ret >= 0 && nfs_need_sync_write(filp, inode)) { - int err = vfs_fsync(filp, 0); - if (err < 0) - ret = err; - } - if (ret > 0) - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); - return ret; -} -EXPORT_SYMBOL_GPL(nfs_file_splice_write); - static int do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { @@ -915,10 +885,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) is_local = 1; /* We're simulating flock() locks using posix locks on the server */ - fl->fl_owner = (fl_owner_t)filp; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; - if (fl->fl_type == F_UNLCK) return do_unlk(filp, cmd, fl, is_local); return do_setlk(filp, cmd, fl, is_local); @@ -938,10 +904,10 @@ EXPORT_SYMBOL_GPL(nfs_setlease); const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = nfs_file_read, - .aio_write = nfs_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = nfs_file_read, + .write_iter = nfs_file_write, .mmap = nfs_file_mmap, .open = nfs_file_open, .flush = nfs_file_flush, @@ -950,7 +916,7 @@ const struct file_operations nfs_file_operations = { .lock = nfs_lock, .flock = nfs_flock, .splice_read = nfs_file_splice_read, - .splice_write = nfs_file_splice_write, + .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = nfs_setlease, }; diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile new file mode 100644 index 00000000000..8516cdffb9e --- /dev/null +++ b/fs/nfs/filelayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Files Layout Driver kernel module +# +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/filelayout/filelayout.c index b86464ba25e..d2eba1c13b7 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -35,11 +35,11 @@ #include <linux/sunrpc/metrics.h> -#include "nfs4session.h" -#include "internal.h" -#include "delegation.h" -#include "nfs4filelayout.h" -#include "nfs4trace.h" +#include "../nfs4session.h" +#include "../internal.h" +#include "../delegation.h" +#include "filelayout.h" +#include "../nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -84,17 +84,17 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } -static void filelayout_reset_write(struct nfs_write_data *data) +static void filelayout_reset_write(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct rpc_task *task = &data->task; if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " - "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, data->task.tk_pid, hdr->inode->i_sb->s_id, - (long long)NFS_FILEID(hdr->inode), + (unsigned long long)NFS_FILEID(hdr->inode), data->args.count, (unsigned long long)data->args.offset); @@ -105,17 +105,17 @@ static void filelayout_reset_write(struct nfs_write_data *data) } } -static void filelayout_reset_read(struct nfs_read_data *data) +static void filelayout_reset_read(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct rpc_task *task = &data->task; if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " - "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, data->task.tk_pid, hdr->inode->i_sb->s_id, - (long long)NFS_FILEID(hdr->inode), + (unsigned long long)NFS_FILEID(hdr->inode), data->args.count, (unsigned long long)data->args.offset); @@ -243,7 +243,7 @@ wait_on_recovery: /* NFS_PROTO call done callback routines */ static int filelayout_read_done_cb(struct rpc_task *task, - struct nfs_read_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; int err; @@ -270,7 +270,7 @@ static int filelayout_read_done_cb(struct rpc_task *task, * rfc5661 is not clear about which credential should be used. */ static void -filelayout_set_layoutcommit(struct nfs_write_data *wdata) +filelayout_set_layoutcommit(struct nfs_pgio_data *wdata) { struct nfs_pgio_header *hdr = wdata->header; @@ -279,7 +279,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata) return; pnfs_set_layoutcommit(wdata); - dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -305,7 +305,7 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) */ static void filelayout_read_prepare(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { rpc_exit(task, -EIO); @@ -317,26 +317,29 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) rpc_exit(task, 0); return; } - rdata->read_done_cb = filelayout_read_done_cb; + rdata->pgio_done_cb = filelayout_read_done_cb; if (nfs41_setup_sequence(rdata->ds_clp->cl_session, &rdata->args.seq_args, &rdata->res.seq_res, task)) return; - nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, - rdata->args.lock_context, FMODE_READ); + if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, + rdata->args.lock_context, FMODE_READ) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ } static void filelayout_read_call_done(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && - task->tk_status == 0) + task->tk_status == 0) { + nfs41_sequence_done(task, &rdata->res.seq_res); return; + } /* Note this may cause RPC to be resent */ rdata->header->mds_ops->rpc_call_done(task, data); @@ -344,14 +347,14 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data) static void filelayout_read_count_stats(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); } static void filelayout_read_release(void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout; filelayout_fenceme(lo->plh_inode, lo); @@ -360,7 +363,7 @@ static void filelayout_read_release(void *data) } static int filelayout_write_done_cb(struct rpc_task *task, - struct nfs_write_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; int err; @@ -416,7 +419,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, static void filelayout_write_prepare(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { rpc_exit(task, -EIO); @@ -433,17 +436,20 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) &wdata->res.seq_res, task)) return; - nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, - wdata->args.lock_context, FMODE_WRITE); + if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, + wdata->args.lock_context, FMODE_WRITE) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ } static void filelayout_write_call_done(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && - task->tk_status == 0) + task->tk_status == 0) { + nfs41_sequence_done(task, &wdata->res.seq_res); return; + } /* Note this may cause RPC to be resent */ wdata->header->mds_ops->rpc_call_done(task, data); @@ -451,14 +457,14 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data) static void filelayout_write_count_stats(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics); } static void filelayout_write_release(void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout; filelayout_fenceme(lo->plh_inode, lo); @@ -523,7 +529,7 @@ static const struct rpc_call_ops filelayout_commit_call_ops = { }; static enum pnfs_try_status -filelayout_read_pagelist(struct nfs_read_data *data) +filelayout_read_pagelist(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; @@ -554,6 +560,7 @@ filelayout_read_pagelist(struct nfs_read_data *data) /* No multipath support. Use first DS */ atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; + data->ds_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) data->args.fh = fh; @@ -562,14 +569,14 @@ filelayout_read_pagelist(struct nfs_read_data *data) data->mds_offset = offset; /* Perform an asynchronous read to ds */ - nfs_initiate_read(ds_clnt, data, - &filelayout_read_call_ops, RPC_TASK_SOFTCONN); + nfs_initiate_pgio(ds_clnt, data, + &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; } /* Perform async writes. */ static enum pnfs_try_status -filelayout_write_pagelist(struct nfs_write_data *data, int sync) +filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) { struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; @@ -594,20 +601,18 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); - data->write_done_cb = filelayout_write_done_cb; + data->pgio_done_cb = filelayout_write_done_cb; atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; + data->ds_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) data->args.fh = fh; - /* - * Get the file offset on the dserver. Set the write offset to - * this offset and save the original offset. - */ + data->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - nfs_initiate_write(ds_clnt, data, + nfs_initiate_pgio(ds_clnt, data, &filelayout_write_call_ops, sync, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; @@ -631,7 +636,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, struct nfs4_deviceid_node *d; struct nfs4_file_layout_dsaddr *dsaddr; int status = -EINVAL; - struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); dprintk("--> %s\n", __func__); @@ -649,7 +653,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out; } - if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) { + if (!fl->stripe_unit) { dprintk("%s Invalid stripe unit (%u)\n", __func__, fl->stripe_unit); goto out; @@ -686,12 +690,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out_put; } - if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { - dprintk("%s Stripe unit (%u) not aligned with rsize %u " - "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, - nfss->wsize); - } - status = 0; out: dprintk("--> %s returns %d\n", __func__, status); @@ -844,11 +842,15 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, { struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); struct pnfs_commit_bucket *buckets; - int size; + int size, i; if (fl->commit_through_mds) return 0; - if (cinfo->ds->nbuckets != 0) { + + size = (fl->stripe_type == STRIPE_SPARSE) ? + fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + if (cinfo->ds->nbuckets >= size) { /* This assumes there is only one IOMODE_RW lseg. What * we really want to do is have a layout_hdr level * dictionary of <multipath_list4, fh> keys, each @@ -858,30 +860,36 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, return 0; } - size = (fl->stripe_type == STRIPE_SPARSE) ? - fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), gfp_flags); if (!buckets) return -ENOMEM; - else { - int i; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&buckets[i].written); + INIT_LIST_HEAD(&buckets[i].committing); + /* mark direct verifier as unset */ + buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; + } - spin_lock(cinfo->lock); - if (cinfo->ds->nbuckets != 0) - kfree(buckets); - else { - cinfo->ds->buckets = buckets; - cinfo->ds->nbuckets = size; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&buckets[i].written); - INIT_LIST_HEAD(&buckets[i].committing); - } - } - spin_unlock(cinfo->lock); - return 0; + spin_lock(cinfo->lock); + if (cinfo->ds->nbuckets >= size) + goto out; + for (i = 0; i < cinfo->ds->nbuckets; i++) { + list_splice(&cinfo->ds->buckets[i].written, + &buckets[i].written); + list_splice(&cinfo->ds->buckets[i].committing, + &buckets[i].committing); + buckets[i].direct_verf.committed = + cinfo->ds->buckets[i].direct_verf.committed; + buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; + buckets[i].clseg = cinfo->ds->buckets[i].clseg; } + swap(cinfo->ds->buckets, buckets); + cinfo->ds->nbuckets = size; +out: + spin_unlock(cinfo->lock); + kfree(buckets); + return 0; } static struct pnfs_layout_segment * @@ -909,47 +917,51 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, /* * filelayout_pg_test(). Called by nfs_can_coalesce_requests() * - * return true : coalesce page - * return false : don't coalesce page + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. */ -static bool +static size_t filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { + unsigned int size; u64 p_stripe, r_stripe; - u32 stripe_unit; + u32 stripe_offset; + u64 segment_offset = pgio->pg_lseg->pls_range.offset; + u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; - if (!pnfs_generic_pg_test(pgio, prev, req) || - !nfs_generic_pg_test(pgio, prev, req)) - return false; + /* calls nfs_generic_pg_test */ + size = pnfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; - p_stripe = (u64)req_offset(prev); - r_stripe = (u64)req_offset(req); - stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + /* see if req and prev are in the same stripe */ + if (prev) { + p_stripe = (u64)req_offset(prev) - segment_offset; + r_stripe = (u64)req_offset(req) - segment_offset; + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); - do_div(p_stripe, stripe_unit); - do_div(r_stripe, stripe_unit); + if (p_stripe != r_stripe) + return 0; + } - return (p_stripe == r_stripe); + /* calculate remaining bytes in the current stripe */ + div_u64_rem((u64)req_offset(req) - segment_offset, + stripe_unit, + &stripe_offset); + WARN_ON_ONCE(stripe_offset > stripe_unit); + if (stripe_offset >= stripe_unit) + return 0; + return min(stripe_unit - (unsigned int)stripe_offset, size); } static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - WARN_ON_ONCE(pgio->pg_lseg != NULL); - - if (req->wb_offset != req->wb_pgbase) { - /* - * Handling unaligned pages is difficult, because have to - * somehow split a req in two in certain cases in the - * pg.test code. Avoid this by just not using pnfs - * in this case. - */ - nfs_pageio_reset_read_mds(pgio); - return; - } - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, @@ -967,11 +979,8 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; - WARN_ON_ONCE(pgio->pg_lseg != NULL); - - if (req->wb_offset != req->wb_pgbase) - goto out_mds; - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, @@ -1061,6 +1070,7 @@ filelayout_choose_commit_list(struct nfs_page *req, */ j = nfs4_fl_calc_j_index(lseg, req_offset(req)); i = select_bucket_index(fl, j); + spin_lock(cinfo->lock); buckets = cinfo->ds->buckets; list = &buckets[i].written; if (list_empty(list)) { @@ -1074,6 +1084,7 @@ filelayout_choose_commit_list(struct nfs_page *req, } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; + spin_unlock(cinfo->lock); return list; } @@ -1170,6 +1181,7 @@ transfer_commit_list(struct list_head *src, struct list_head *dst, return ret; } +/* Note called with cinfo->lock held. */ static int filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo, @@ -1214,19 +1226,22 @@ static void filelayout_recover_commit_reqs(struct list_head *dst, struct nfs_commit_info *cinfo) { struct pnfs_commit_bucket *b; + struct pnfs_layout_segment *freeme; int i; - /* NOTE cinfo->lock is NOT held, relying on fact that this is - * only called on single thread per dreq. - * Can't take the lock because need to do pnfs_put_lseg - */ +restart: + spin_lock(cinfo->lock); for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (transfer_commit_list(&b->written, dst, cinfo, 0)) { - pnfs_put_lseg(b->wlseg); + freeme = b->wlseg; b->wlseg = NULL; + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); + goto restart; } } cinfo->ds->nwritten = 0; + spin_unlock(cinfo->lock); } static unsigned int @@ -1237,6 +1252,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) struct nfs_commit_data *data; int i, j; unsigned int nreq = 0; + struct pnfs_layout_segment *freeme; fl_cinfo = cinfo->ds; bucket = fl_cinfo->buckets; @@ -1247,8 +1263,10 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) if (!data) break; data->ds_commit_index = i; + spin_lock(cinfo->lock); data->lseg = bucket->clseg; bucket->clseg = NULL; + spin_unlock(cinfo->lock); list_add(&data->pages, list); nreq++; } @@ -1258,8 +1276,11 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) if (list_empty(&bucket->committing)) continue; nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); - pnfs_put_lseg(bucket->clseg); + spin_lock(cinfo->lock); + freeme = bucket->clseg; bucket->clseg = NULL; + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); } /* Caller will clean up entries put on list */ return nreq; @@ -1324,7 +1345,7 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) struct nfs4_filelayout *flo; flo = kzalloc(sizeof(*flo), gfp_flags); - return &flo->generic_hdr; + return flo != NULL ? &flo->generic_hdr : NULL; } static void diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/filelayout/filelayout.h index cebd20e7e92..ffbddf2219e 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -30,7 +30,7 @@ #ifndef FS_NFS_NFS4FILELAYOUT_H #define FS_NFS_NFS4FILELAYOUT_H -#include "pnfs.h" +#include "../pnfs.h" /* * Default data server connection timeout and retrans vaules. diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index c7c295e556e..44bf0140a4c 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -33,9 +33,9 @@ #include <linux/module.h> #include <linux/sunrpc/addr.h> -#include "internal.h" -#include "nfs4session.h" -#include "nfs4filelayout.h" +#include "../internal.h" +#include "../nfs4session.h" +#include "filelayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -95,7 +95,7 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) b6 = (struct sockaddr_in6 *)addr2; /* LINKLOCAL addresses must have matching scope_id */ - if (ipv6_addr_scope(&a6->sin6_addr) == + if (ipv6_addr_src_scope(&a6->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && a6->sin6_scope_id != b6->sin6_scope_id) return false; @@ -789,9 +789,9 @@ static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS4DS_CONNECTING, &ds->ds_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); } diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 24d1d1c5fca..3ef01f0ba0b 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -39,7 +39,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp) /* create a cache index for looking up filehandles */ clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, &nfs_fscache_server_index_def, - clp); + clp, true); dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", clp, clp->fscache); } @@ -139,7 +139,7 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int /* create a cache index for looking up filehandles */ nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, &nfs_fscache_super_index_def, - nfss); + nfss, true); dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", nfss, nfss->fscache); return; @@ -178,163 +178,79 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) /* * Initialise the per-inode cache cookie pointer for an NFS inode. */ -void nfs_fscache_init_inode_cookie(struct inode *inode) +void nfs_fscache_init_inode(struct inode *inode) { - NFS_I(inode)->fscache = NULL; - if (S_ISREG(inode->i_mode)) - set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); -} - -/* - * Get the per-inode cache cookie for an NFS inode. - */ -static void nfs_fscache_enable_inode_cookie(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; struct nfs_inode *nfsi = NFS_I(inode); - if (nfsi->fscache || !NFS_FSCACHE(inode)) + nfsi->fscache = NULL; + if (!S_ISREG(inode->i_mode)) return; - - if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { - nfsi->fscache = fscache_acquire_cookie( - NFS_SB(sb)->fscache, - &nfs_fscache_inode_object_def, - nfsi); - - dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", - sb, nfsi, nfsi->fscache); - } + nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi, false); } /* * Release a per-inode cookie. */ -void nfs_fscache_release_inode_cookie(struct inode *inode) +void nfs_fscache_clear_inode(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfs_i_fscache(inode); - dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", - nfsi, nfsi->fscache); + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - fscache_relinquish_cookie(nfsi->fscache, 0); + fscache_relinquish_cookie(cookie, false); nfsi->fscache = NULL; } -/* - * Retire a per-inode cookie, destroying the data attached to it. - */ -void nfs_fscache_zap_inode_cookie(struct inode *inode) +static bool nfs_fscache_can_enable(void *data) { - struct nfs_inode *nfsi = NFS_I(inode); + struct inode *inode = data; - dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", - nfsi, nfsi->fscache); - - fscache_relinquish_cookie(nfsi->fscache, 1); - nfsi->fscache = NULL; + return !inode_is_open_for_write(inode); } /* - * Turn off the cache with regard to a per-inode cookie if opened for writing, - * invalidating all the pages in the page cache relating to the associated - * inode to clear the per-page caching. - */ -static void nfs_fscache_disable_inode_cookie(struct inode *inode) -{ - clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); - - if (NFS_I(inode)->fscache) { - dfprintk(FSCACHE, - "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); - - /* Need to uncache any pages attached to this inode that - * fscache knows about before turning off the cache. - */ - fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode); - nfs_fscache_zap_inode_cookie(inode); - } -} - -/* - * wait_on_bit() sleep function for uninterruptible waiting - */ -static int nfs_fscache_wait_bit(void *flags) -{ - schedule(); - return 0; -} - -/* - * Lock against someone else trying to also acquire or relinquish a cookie - */ -static inline void nfs_fscache_inode_lock(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) - wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, - nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); -} - -/* - * Unlock cookie management lock - */ -static inline void nfs_fscache_inode_unlock(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - smp_mb__before_clear_bit(); - clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); -} - -/* - * Decide if we should enable or disable local caching for this inode. - * - For now, with NFS, only regular files that are open read-only will be able - * to use the cache. - * - May be invoked multiple times in parallel by parallel nfs_open() functions. - */ -void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) -{ - if (NFS_FSCACHE(inode)) { - nfs_fscache_inode_lock(inode); - if ((filp->f_flags & O_ACCMODE) != O_RDONLY) - nfs_fscache_disable_inode_cookie(inode); - else - nfs_fscache_enable_inode_cookie(inode); - nfs_fscache_inode_unlock(inode); - } -} -EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie); - -/* - * Replace a per-inode cookie due to revalidation detecting a file having - * changed on the server. + * Enable or disable caching for a file that is being opened as appropriate. + * The cookie is allocated when the inode is initialised, but is not enabled at + * that time. Enablement is deferred to file-open time to avoid stat() and + * access() thrashing the cache. + * + * For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * + * We enable the cache for an inode if we open it read-only and it isn't + * currently open for writing. We disable the cache if the inode is open + * write-only. + * + * The caller uses the file struct to pin i_writecount on the inode before + * calling us when a file is opened for writing, so we can make use of that. + * + * Note that this may be invoked multiple times in parallel by parallel + * nfs_open() functions. */ -void nfs_fscache_reset_inode_cookie(struct inode *inode) +void nfs_fscache_open_file(struct inode *inode, struct file *filp) { struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_server *nfss = NFS_SERVER(inode); - NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache); + struct fscache_cookie *cookie = nfs_i_fscache(inode); - nfs_fscache_inode_lock(inode); - if (nfsi->fscache) { - /* retire the current fscache cache and get a new one */ - fscache_relinquish_cookie(nfsi->fscache, 1); - - nfsi->fscache = fscache_acquire_cookie( - nfss->nfs_client->fscache, - &nfs_fscache_inode_object_def, - nfsi); + if (!fscache_cookie_valid(cookie)) + return; - dfprintk(FSCACHE, - "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", - nfss, nfsi, old, nfsi->fscache); + if (inode_is_open_for_write(inode)) { + dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); + clear_bit(NFS_INO_FSCACHE, &nfsi->flags); + fscache_disable_cookie(cookie, true); + fscache_uncache_all_inode_pages(cookie, inode); + } else { + dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); + fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode); + if (fscache_cookie_enabled(cookie)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); } - nfs_fscache_inode_unlock(inode); } +EXPORT_SYMBOL_GPL(nfs_fscache_open_file); /* * Release the caching state associated with a page, if the page isn't busy @@ -344,12 +260,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode) int nfs_fscache_release_page(struct page *page, gfp_t gfp) { if (PageFsCache(page)) { - struct nfs_inode *nfsi = NFS_I(page->mapping->host); - struct fscache_cookie *cookie = nfsi->fscache; + struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host); BUG_ON(!cookie); dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", - cookie, page, nfsi); + cookie, page, NFS_I(page->mapping->host)); if (!fscache_maybe_release_page(cookie, page, gfp)) return 0; @@ -367,13 +282,12 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp) */ void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(inode); - struct fscache_cookie *cookie = nfsi->fscache; + struct fscache_cookie *cookie = nfs_i_fscache(inode); BUG_ON(!cookie); dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", - cookie, page, nfsi); + cookie, page, NFS_I(inode)); fscache_wait_on_page_write(cookie, page); @@ -417,9 +331,9 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, dfprintk(FSCACHE, "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", - NFS_I(inode)->fscache, page, page->index, page->flags, inode); + nfs_i_fscache(inode), page, page->index, page->flags, inode); - ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache, + ret = fscache_read_or_alloc_page(nfs_i_fscache(inode), page, nfs_readpage_from_fscache_complete, ctx, @@ -459,9 +373,9 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, int ret; dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", - NFS_I(inode)->fscache, npages, inode); + nfs_i_fscache(inode), npages, inode); - ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache, + ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode), mapping, pages, nr_pages, nfs_readpage_from_fscache_complete, ctx, @@ -506,15 +420,15 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) dfprintk(FSCACHE, "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", - NFS_I(inode)->fscache, page, page->index, page->flags, sync); + nfs_i_fscache(inode), page, page->index, page->flags, sync); - ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL); + ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL); dfprintk(FSCACHE, "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", page, page->index, page->flags, ret); if (ret != 0) { - fscache_uncache_page(NFS_I(inode)->fscache, page); + fscache_uncache_page(nfs_i_fscache(inode), page); nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1); nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 4ecb76652eb..d7fe3e799f2 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -76,11 +76,9 @@ extern void nfs_fscache_release_client_cookie(struct nfs_client *); extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int); extern void nfs_fscache_release_super_cookie(struct super_block *); -extern void nfs_fscache_init_inode_cookie(struct inode *); -extern void nfs_fscache_release_inode_cookie(struct inode *); -extern void nfs_fscache_zap_inode_cookie(struct inode *); -extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); -extern void nfs_fscache_reset_inode_cookie(struct inode *); +extern void nfs_fscache_init_inode(struct inode *); +extern void nfs_fscache_clear_inode(struct inode *); +extern void nfs_fscache_open_file(struct inode *, struct file *); extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); extern int nfs_fscache_release_page(struct page *, gfp_t); @@ -187,12 +185,10 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} -static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_set_inode_cookie(struct inode *inode, - struct file *filp) {} -static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_init_inode(struct inode *inode) {} +static inline void nfs_fscache_clear_inode(struct inode *inode) {} +static inline void nfs_fscache_open_file(struct inode *inode, + struct file *filp) {} static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 66984a9aafa..b94f80420a5 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -120,7 +120,8 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, security_d_instantiate(ret, inode); spin_lock(&ret->d_lock); - if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + if (IS_ROOT(ret) && !ret->d_fsdata && + !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { ret->d_fsdata = name; name = NULL; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index eda8879171c..9927913c97c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -122,13 +122,13 @@ void nfs_clear_inode(struct inode *inode) WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); - nfs_fscache_release_inode_cookie(inode); + nfs_fscache_clear_inode(inode); } EXPORT_SYMBOL_GPL(nfs_clear_inode); void nfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); nfs_clear_inode(inode); } @@ -147,6 +147,17 @@ int nfs_sync_mapping(struct address_space *mapping) return ret; } +static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (inode->i_mapping->nrpages == 0) + flags &= ~NFS_INO_INVALID_DATA; + nfsi->cache_validity |= flags; + if (flags & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode); +} + /* * Invalidate the local caches */ @@ -162,19 +173,17 @@ static void nfs_zap_caches_locked(struct inode *inode) memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { - nfs_fscache_invalidate(inode); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_LABEL + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_REVAL_PAGECACHE); } else - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_LABEL + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_REVAL_PAGECACHE); + nfs_zap_label_cache_locked(nfsi); } void nfs_zap_caches(struct inode *inode) @@ -188,8 +197,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { if (mapping->nrpages != 0) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; - nfs_fscache_invalidate(inode); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); spin_unlock(&inode->i_lock); } } @@ -210,7 +218,7 @@ EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); void nfs_invalidate_atime(struct inode *inode) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_invalidate_atime); @@ -266,6 +274,13 @@ nfs_init_locked(struct inode *inode, void *opaque) } #ifdef CONFIG_NFS_V4_SECURITY_LABEL +static void nfs_clear_label_invalid(struct inode *inode) +{ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL; + spin_unlock(&inode->i_lock); +} + void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, struct nfs4_label *label) { @@ -274,12 +289,6 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, if (label == NULL) return; - if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0) - return; - - if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2) - return; - if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { error = security_inode_notifysecctx(inode, label->label, label->len); @@ -289,6 +298,7 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, __func__, (char *)label->label, label->len, error); + nfs_clear_label_invalid(inode); } } @@ -318,7 +328,7 @@ struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) } EXPORT_SYMBOL_GPL(nfs4_label_alloc); #else -void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, struct nfs4_label *label) { } @@ -368,7 +378,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_mode = fattr->mode; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && nfs_server_capable(inode, NFS_CAP_MODE)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -414,36 +424,36 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; else if (nfs_server_capable(inode, NFS_CAP_ATIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; else if (nfs_server_capable(inode, NFS_CAP_MTIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else if (nfs_server_capable(inode, NFS_CAP_CTIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode->i_version = fattr->change_attr; else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (nfs_server_capable(inode, NFS_CAP_OWNER)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -459,14 +469,14 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfsi->attrtimeo_timestamp = now; nfsi->access_cache = RB_ROOT; - nfs_fscache_init_inode_cookie(inode); + nfs_fscache_init_inode(inode); unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); - dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", + dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), atomic_read(&inode->i_count)); @@ -549,6 +559,9 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) spin_lock(&inode->i_lock); i_size_write(inode, offset); + /* Optimisation */ + if (offset == 0) + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); @@ -577,7 +590,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL); spin_unlock(&inode->i_lock); } if ((attr->ia_valid & ATTR_SIZE) != 0) { @@ -587,6 +601,25 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); +static void nfs_request_parent_use_readdirplus(struct dentry *dentry) +{ + struct dentry *parent; + + parent = dget_parent(dentry); + nfs_force_use_readdirplus(parent->d_inode); + dput(parent); +} + +static bool nfs_need_revalidate_inode(struct inode *inode) +{ + if (NFS_I(inode)->cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) + return true; + if (nfs_attribute_cache_expired(inode)) + return true; + return false; +} + int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; @@ -615,10 +648,13 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) need_atime = 0; - if (need_atime) - err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - else - err = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (need_atime || nfs_need_revalidate_inode(inode)) { + struct nfs_server *server = NFS_SERVER(inode); + + if (server->caps & NFS_CAP_READDIRPLUS) + nfs_request_parent_use_readdirplus(dentry); + err = __nfs_revalidate_inode(server, inode); + } if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); @@ -854,7 +890,7 @@ int nfs_open(struct inode *inode, struct file *filp) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); - nfs_fscache_set_inode_cookie(inode, filp); + nfs_fscache_open_file(inode, filp); return 0; } @@ -876,8 +912,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); - dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", - inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", + inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); trace_nfs_revalidate_inode_enter(inode); @@ -901,9 +937,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label); if (status != 0) { - dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), status); + (unsigned long long)NFS_FILEID(inode), status); if (status == -ESTALE) { nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) @@ -914,18 +950,20 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = nfs_refresh_inode(inode, fattr); if (status) { - dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), status); + (unsigned long long)NFS_FILEID(inode), status); goto err_out; } if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", + nfs_setsecurity(inode, fattr, label); + + dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode)); + (unsigned long long)NFS_FILEID(inode)); err_out: nfs4_label_free(label); @@ -958,9 +996,7 @@ int nfs_attribute_cache_expired(struct inode *inode) */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { - if (!(NFS_I(inode)->cache_validity & - (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) - && !nfs_attribute_cache_expired(inode)) + if (!nfs_need_revalidate_inode(inode)) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(server, inode); } @@ -981,16 +1017,17 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (ret < 0) return ret; } - spin_lock(&inode->i_lock); - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) { + spin_lock(&inode->i_lock); memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); + } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", - inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode)); return 0; } @@ -1011,6 +1048,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode) int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; int ret = 0; /* swapfiles are not supposed to be shared. */ @@ -1022,12 +1060,46 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) if (ret < 0) goto out; } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { - trace_nfs_invalidate_mapping_enter(inode); - ret = nfs_invalidate_mapping(inode, mapping); - trace_nfs_invalidate_mapping_exit(inode, ret); + + /* + * We must clear NFS_INO_INVALID_DATA first to ensure that + * invalidations that come in while we're shooting down the mappings + * are respected. But, that leaves a race window where one revalidator + * can clear the flag, and then another checks it before the mapping + * gets invalidated. Fix that by serializing access to this part of + * the function. + * + * At the same time, we need to allow other tasks to see whether we + * might be in the middle of invalidating the pages, so we only set + * the bit lock here if it looks like we're going to be doing that. + */ + for (;;) { + ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + goto out; + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + break; + spin_unlock(&inode->i_lock); + goto out; } + set_bit(NFS_INO_INVALIDATING, bitlock); + smp_wmb(); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); + trace_nfs_invalidate_mapping_enter(inode); + ret = nfs_invalidate_mapping(inode, mapping); + trace_nfs_invalidate_mapping_exit(inode, ret); + + clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_INO_INVALIDATING); out: return ret; } @@ -1042,7 +1114,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr && inode->i_version == fattr->pre_change_attr) { inode->i_version = fattr->change_attr; if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); ret |= NFS_INO_INVALID_ATTR; } /* If we have atomic WCC data, we may update some attributes */ @@ -1058,7 +1130,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) @@ -1069,9 +1141,6 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr ret |= NFS_INO_INVALID_ATTR; } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); - return ret; } @@ -1130,7 +1199,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) - nfsi->cache_validity |= invalid; + nfs_set_cache_invalid(inode, invalid); nfsi->read_cache_jiffies = fattr->time_start; return 0; @@ -1209,6 +1278,7 @@ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) * not on the result */ return nfs_fhandle_hash(fh); } +EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash); /* * _nfs_display_fhandle - display an NFS file handle on the console @@ -1253,6 +1323,7 @@ void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) } } } +EXPORT_SYMBOL_GPL(_nfs_display_fhandle); #endif /** @@ -1284,12 +1355,28 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } +/* + * Don't trust the change_attribute, mtime, ctime or size if + * a pnfs LAYOUTCOMMIT is outstanding + */ +static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, + struct nfs_fattr *fattr) +{ + if (pnfs_layoutcommit_outstanding(inode)) + fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | + NFS_ATTR_FATTR_MTIME | + NFS_ATTR_FATTR_CTIME | + NFS_ATTR_FATTR_SIZE); +} + static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int ret; trace_nfs_refresh_inode_enter(inode); + nfs_inode_attrs_handle_layoutcommit(inode, fattr); + if (nfs_inode_attrs_need_update(inode, fattr)) ret = nfs_update_inode(inode, fattr); else @@ -1325,13 +1412,11 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); + unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (S_ISDIR(inode->i_mode)) { - nfsi->cache_validity |= NFS_INO_INVALID_DATA; - nfs_fscache_invalidate(inode); - } + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, invalid); if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; return nfs_refresh_inode_locked(inode, fattr); @@ -1436,7 +1521,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long now = jiffies; unsigned long save_cache_validity; - dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n", + dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); @@ -1457,7 +1542,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* * Big trouble! The inode has become a different object. */ - printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n", + printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", __func__, inode->i_ino, inode->i_mode, fattr->mode); goto out_err; } @@ -1498,18 +1583,20 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_version = fattr->change_attr; } } else if (server->caps & NFS_CAP_CHANGE_ATTR) - invalid |= save_cache_validity; + nfsi->cache_validity |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } else if (server->caps & NFS_CAP_MTIME) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_CTIME) { memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } else if (server->caps & NFS_CAP_CTIME) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); /* Check if our cached file size is stale */ @@ -1519,10 +1606,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) || - new_isize > cur_isize) { + if ((nfsi->npages == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid &= ~NFS_INO_REVAL_PAGECACHE; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -1532,7 +1619,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) (long long)new_isize); } } else - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); @@ -1540,7 +1628,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_ATIME) memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); else if (server->caps & NFS_CAP_ATIME) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_MODE) { @@ -1551,7 +1640,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } } else if (server->caps & NFS_CAP_MODE) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); @@ -1562,7 +1652,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_uid = fattr->uid; } } else if (server->caps & NFS_CAP_OWNER) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); @@ -1573,7 +1664,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_gid = fattr->gid; } } else if (server->caps & NFS_CAP_OWNER_GROUP) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); @@ -1586,7 +1678,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) set_nlink(inode, fattr->nlink); } } else if (server->caps & NFS_CAP_NLINK) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -1599,7 +1692,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_blocks = fattr->du.nfs2.blocks; /* Update attrtimeo value if we're out of the unstable period */ - if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) { + if (invalid & NFS_INO_INVALID_ATTR) { nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; @@ -1612,17 +1705,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } } invalid &= ~NFS_INO_INVALID_ATTR; - invalid &= ~NFS_INO_INVALID_LABEL; /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) || (save_cache_validity & NFS_INO_REVAL_FORCED)) - nfsi->cache_validity |= invalid; - - if (invalid & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); + nfs_set_cache_invalid(inode, invalid); return 0; out_err: @@ -1643,10 +1732,6 @@ struct inode *nfs_alloc_inode(struct super_block *sb) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; -#ifdef CONFIG_NFS_V3_ACL - nfsi->acl_access = ERR_PTR(-EAGAIN); - nfsi->acl_default = ERR_PTR(-EAGAIN); -#endif #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 38da8c2b81a..f415cbf9f6c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -88,8 +88,8 @@ struct nfs_parsed_mount_data { unsigned int namlen; unsigned int options; unsigned int bsize; - unsigned int auth_flavor_len; - rpc_authflavor_t auth_flavors[1]; + struct nfs_auth_info auth_info; + rpc_authflavor_t selected_flavor; char *client_address; unsigned int version; unsigned int minorversion; @@ -154,6 +154,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, rpc_authflavor_t); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); +void nfs_server_remove_lists(struct nfs_server *); void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, rpc_authflavor_t); @@ -174,6 +175,9 @@ extern struct nfs_server *nfs4_create_server( struct nfs_subversion *); extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, struct nfs_fh *); +extern int nfs4_update_server(struct nfs_server *server, const char *hostname, + struct sockaddr *sap, size_t salen, + struct net *net); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, @@ -227,13 +231,21 @@ extern void nfs_destroy_writepagecache(void); extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); -extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount); extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); int nfs_iocounter_wait(struct nfs_io_counter *c); +extern const struct nfs_pageio_ops nfs_pgio_rw_ops; +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); +void nfs_rw_header_free(struct nfs_pgio_header *); +void nfs_pgio_data_release(struct nfs_pgio_data *); +int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); +int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, + const struct rpc_call_ops *, int, int); +void nfs_free_request(struct nfs_page *req); + static inline void nfs_iocounter_init(struct nfs_io_counter *c) { c->flags = 0; @@ -266,6 +278,30 @@ extern const u32 nfs41_maxgetdevinfo_overhead; extern struct rpc_procinfo nfs4_procedures[]; #endif +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); +static inline void nfs4_label_free(struct nfs4_label *label) +{ + if (label) { + kfree(label->label); + kfree(label); + } + return; +} + +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ + if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL)) + nfsi->cache_validity |= NFS_INO_INVALID_LABEL; +} +#else +static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } +static inline void nfs4_label_free(void *label) {} +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ +} +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ + /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, @@ -273,6 +309,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const char *ip_addr); /* dir.c */ +extern void nfs_force_use_readdirplus(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, @@ -291,16 +328,14 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *) int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); loff_t nfs_file_llseek(struct file *, loff_t, int); int nfs_file_flush(struct file *, fl_owner_t); -ssize_t nfs_file_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); +ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int nfs_file_mmap(struct file *, struct vm_area_struct *); -ssize_t nfs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); +ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); -ssize_t nfs_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, - size_t, unsigned int); int nfs_check_flags(int); int nfs_setlease(struct file *, long, struct file_lock **); @@ -323,6 +358,7 @@ extern struct file_system_type nfs_xdev_fs_type; extern struct file_system_type nfs4_xdev_fs_type; extern struct file_system_type nfs4_referral_fs_type; #endif +bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); void nfs_initialise_sb(struct super_block *); @@ -365,19 +401,11 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool struct nfs_pgio_completion_ops; /* read.c */ -extern struct nfs_read_header *nfs_readhdr_alloc(void); -extern void nfs_readhdr_free(struct nfs_pgio_header *hdr); extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode, + struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); -extern int nfs_initiate_read(struct rpc_clnt *clnt, - struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, int flags); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); -extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); -extern void nfs_readdata_release(struct nfs_read_data *rdata); /* super.c */ void nfs_clone_super(struct super_block *, struct nfs_mount_info *); @@ -392,19 +420,10 @@ int nfs_remount(struct super_block *sb, int *flags, char *raw_data); /* write.c */ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags, + struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); -extern struct nfs_write_header *nfs_writehdr_alloc(void); -extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); -extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); -extern void nfs_writedata_release(struct nfs_write_data *wdata); extern void nfs_commit_free(struct nfs_commit_data *p); -extern int nfs_initiate_write(struct rpc_clnt *clnt, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - int how, int flags); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, @@ -417,6 +436,7 @@ extern void nfs_init_commit(struct nfs_commit_data *data, struct nfs_commit_info *cinfo); int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max); +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *); int nfs_scan_commit(struct inode *inode, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_mark_request_commit(struct nfs_page *req, @@ -445,6 +465,13 @@ extern int nfs_migrate_page(struct address_space *, #define nfs_migrate_page NULL #endif +/* unlink.c */ +extern struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, + struct dentry *old_dentry, struct dentry *new_dentry, + void (*complete)(struct rpc_task *, struct nfs_renamedata *)); +extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); + /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); @@ -455,7 +482,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode) extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ -extern void __nfs4_read_done_cb(struct nfs_read_data *); +extern void __nfs4_read_done_cb(struct nfs_pgio_data *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, const char *ip_addr); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 62db136339e..5f61b83f4a1 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -103,7 +103,7 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) /* * typedef opaque nfsdata<>; */ -static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result) +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result) { u32 recvd, count; __be32 *p; @@ -613,7 +613,7 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, * }; */ static void encode_readargs(struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { u32 offset = args->offset; u32 count = args->count; @@ -629,7 +629,7 @@ static void encode_readargs(struct xdr_stream *xdr, static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { encode_readargs(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, @@ -649,7 +649,7 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, * }; */ static void encode_writeargs(struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { u32 offset = args->offset; u32 count = args->count; @@ -669,7 +669,7 @@ static void encode_writeargs(struct xdr_stream *xdr, static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { encode_writeargs(xdr, args); xdr->buf->flags |= XDRBUF_WRITE; @@ -857,7 +857,7 @@ out_default: * }; */ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; @@ -878,7 +878,7 @@ out_default: } static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { /* All NFSv2 writes are "file sync" writes */ result->verf->committed = NFS_FILE_SYNC; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 4a1aafba6a2..8f854dde415 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -10,179 +10,7 @@ #define NFSDBG_FACILITY NFSDBG_PROC -ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int pos=0, len=0; - -# define output(s) do { \ - if (pos + sizeof(s) <= size) { \ - memcpy(buffer + pos, s, sizeof(s)); \ - pos += sizeof(s); \ - } \ - len += sizeof(s); \ - } while(0) - - acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - output("system.posix_acl_access"); - posix_acl_release(acl); - } - - if (S_ISDIR(inode->i_mode)) { - acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - output("system.posix_acl_default"); - posix_acl_release(acl); - } - } - -# undef output - - if (!buffer || len <= size) - return len; - return -ERANGE; -} - -ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int type, error = 0; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - acl = nfs3_proc_getacl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - if (type == ACL_TYPE_ACCESS && acl->a_count == 0) - error = -ENODATA; - else - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - } else - error = -ENODATA; - - return error; -} - -int nfs3_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int type, error; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - error = nfs3_proc_setacl(inode, type, acl); - posix_acl_release(acl); - - return error; -} - -int nfs3_removexattr(struct dentry *dentry, const char *name) -{ - struct inode *inode = dentry->d_inode; - int type; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - return nfs3_proc_setacl(inode, type, NULL); -} - -static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) -{ - if (!IS_ERR(nfsi->acl_access)) { - posix_acl_release(nfsi->acl_access); - nfsi->acl_access = ERR_PTR(-EAGAIN); - } - if (!IS_ERR(nfsi->acl_default)) { - posix_acl_release(nfsi->acl_default); - nfsi->acl_default = ERR_PTR(-EAGAIN); - } -} - -void nfs3_forget_cached_acls(struct inode *inode) -{ - dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, - inode->i_ino); - spin_lock(&inode->i_lock); - __nfs3_forget_cached_acls(NFS_I(inode)); - spin_unlock(&inode->i_lock); -} - -static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) -{ - struct nfs_inode *nfsi = NFS_I(inode); - struct posix_acl *acl = ERR_PTR(-EINVAL); - - spin_lock(&inode->i_lock); - switch(type) { - case ACL_TYPE_ACCESS: - acl = nfsi->acl_access; - break; - - case ACL_TYPE_DEFAULT: - acl = nfsi->acl_default; - break; - - default: - goto out; - } - if (IS_ERR(acl)) - acl = ERR_PTR(-EAGAIN); - else - acl = posix_acl_dup(acl); -out: - spin_unlock(&inode->i_lock); - dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, - inode->i_ino, type, acl); - return acl; -} - -static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id, - inode->i_ino, acl, dfacl); - spin_lock(&inode->i_lock); - __nfs3_forget_cached_acls(NFS_I(inode)); - if (!IS_ERR(acl)) - nfsi->acl_access = posix_acl_dup(acl); - if (!IS_ERR(dfacl)) - nfsi->acl_default = posix_acl_dup(dfacl); - spin_unlock(&inode->i_lock); -} - -struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) +struct posix_acl *nfs3_get_acl(struct inode *inode, int type) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFSACL_MAXPAGES] = { }; @@ -198,7 +26,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) .rpc_argp = &args, .rpc_resp = &res, }; - struct posix_acl *acl; int status, count; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -207,10 +34,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) status = nfs_revalidate_inode(server, inode); if (status < 0) return ERR_PTR(status); - acl = nfs3_get_cached_acl(inode, type); - if (acl != ERR_PTR(-EAGAIN)) - return acl; - acl = NULL; /* * Only get the access acl when explicitly requested: We don't @@ -257,40 +80,41 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) } if (res.acl_access != NULL) { - if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { + if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) || + res.acl_access->a_count == 0) { posix_acl_release(res.acl_access); res.acl_access = NULL; } } - nfs3_cache_acls(inode, - (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL), - (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL)); - switch(type) { - case ACL_TYPE_ACCESS: - acl = res.acl_access; - res.acl_access = NULL; - break; + if (res.mask & NFS_ACL) + set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access); + else + forget_cached_acl(inode, ACL_TYPE_ACCESS); - case ACL_TYPE_DEFAULT: - acl = res.acl_default; - res.acl_default = NULL; + if (res.mask & NFS_DFACL) + set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default); + else + forget_cached_acl(inode, ACL_TYPE_DEFAULT); + + nfs_free_fattr(res.fattr); + if (type == ACL_TYPE_ACCESS) { + posix_acl_release(res.acl_default); + return res.acl_access; + } else { + posix_acl_release(res.acl_access); + return res.acl_default; } getout: posix_acl_release(res.acl_access); posix_acl_release(res.acl_default); nfs_free_fattr(res.fattr); - - if (status != 0) { - posix_acl_release(acl); - acl = ERR_PTR(status); - } - return acl; + return ERR_PTR(status); } -static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl) +static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_fattr *fattr; @@ -353,7 +177,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, switch (status) { case 0: status = nfs_refresh_inode(inode, fattr); - nfs3_cache_acls(inode, acl, dfacl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: @@ -373,40 +198,43 @@ out: return status; } -int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) +int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + int ret; + ret = __nfs3_proc_setacls(inode, acl, dfacl); + return (ret == -EOPNOTSUPP) ? 0 : ret; + +} + +int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct posix_acl *alloc = NULL, *dfacl = NULL; int status; if (S_ISDIR(inode->i_mode)) { switch(type) { - case ACL_TYPE_ACCESS: - alloc = dfacl = nfs3_proc_getacl(inode, - ACL_TYPE_DEFAULT); - if (IS_ERR(alloc)) - goto fail; - break; - - case ACL_TYPE_DEFAULT: - dfacl = acl; - alloc = acl = nfs3_proc_getacl(inode, - ACL_TYPE_ACCESS); - if (IS_ERR(alloc)) - goto fail; - break; - - default: - return -EINVAL; + case ACL_TYPE_ACCESS: + alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(alloc)) + goto fail; + break; + + case ACL_TYPE_DEFAULT: + dfacl = acl; + alloc = acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(alloc)) + goto fail; + break; } - } else if (type != ACL_TYPE_ACCESS) - return -EINVAL; + } if (acl == NULL) { alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); if (IS_ERR(alloc)) goto fail; } - status = nfs3_proc_setacls(inode, acl, dfacl); + status = __nfs3_proc_setacls(inode, acl, dfacl); posix_acl_release(alloc); return status; @@ -414,27 +242,51 @@ fail: return PTR_ERR(alloc); } -int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, - umode_t mode) +const struct xattr_handler *nfs3_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + NULL, +}; + +static int +nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, + size_t size, ssize_t *result) { - struct posix_acl *dfacl, *acl; - int error = 0; + struct posix_acl *acl; + char *p = data + *result; - dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(dfacl)) { - error = PTR_ERR(dfacl); - return (error == -EOPNOTSUPP) ? 0 : error; - } - if (!dfacl) + acl = get_acl(inode, type); + if (!acl) return 0; - acl = posix_acl_dup(dfacl); - error = posix_acl_create(&acl, GFP_KERNEL, &mode); - if (error < 0) - goto out_release_dfacl; - error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? - dfacl : NULL); + posix_acl_release(acl); -out_release_dfacl: - posix_acl_release(dfacl); - return error; + + *result += strlen(name); + *result += 1; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +nfs3_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct inode *inode = dentry->d_inode; + ssize_t result = 0; + int error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, + POSIX_ACL_XATTR_ACCESS, data, size, &result); + if (error) + return error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, + POSIX_ACL_XATTR_DEFAULT, data, size, &result); + if (error) + return error; + return result; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 01b6f6a49d1..f0afa291fd5 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -18,6 +18,7 @@ #include <linux/lockd/bind.h> #include <linux/nfs_mount.h> #include <linux/freezer.h> +#include <linux/xattr.h> #include "iostat.h" #include "internal.h" @@ -317,8 +318,8 @@ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call create %pd\n", dentry); @@ -340,7 +341,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.create.verifier[1] = cpu_to_be32(current->pid); } - sattr->ia_mode &= ~current_umask(); + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; for (;;) { status = nfs3_do_create(dir, dentry, data); @@ -366,7 +369,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, } if (status != 0) - goto out; + goto out_release_acls; /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. */ @@ -385,9 +388,14 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) - goto out; + goto out_release_acls; } - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply create: %d\n", status); @@ -471,41 +479,6 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, } static int -nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs_renameargs arg = { - .old_dir = NFS_FH(old_dir), - .old_name = old_name, - .new_dir = NFS_FH(new_dir), - .new_name = new_name, - }; - struct nfs_renameres res; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status = -ENOMEM; - - dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - - res.old_fattr = nfs_alloc_fattr(); - res.new_fattr = nfs_alloc_fattr(); - if (res.old_fattr == NULL || res.new_fattr == NULL) - goto out; - - status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); - nfs_post_op_update_inode(old_dir, res.old_fattr); - nfs_post_op_update_inode(new_dir, res.new_fattr); -out: - nfs_free_fattr(res.old_fattr); - nfs_free_fattr(res.new_fattr); - dprintk("NFS reply rename: %d\n", status); - return status; -} - -static int nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { struct nfs3_linkargs arg = { @@ -572,18 +545,20 @@ out: static int nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); - sattr->ia_mode &= ~current_umask(); - data = nfs3_alloc_createdata(); if (data == NULL) goto out; + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; data->arg.mkdir.fh = NFS_FH(dir); data->arg.mkdir.name = dentry->d_name.name; @@ -592,9 +567,13 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) status = nfs3_do_create(dir, dentry, data); if (status != 0) - goto out; + goto out_release_acls; - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply mkdir: %d\n", status); @@ -691,19 +670,21 @@ static int nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mknod %pd %u:%u\n", dentry, MAJOR(rdev), MINOR(rdev)); - sattr->ia_mode &= ~current_umask(); - data = nfs3_alloc_createdata(); if (data == NULL) goto out; + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; data->arg.mknod.fh = NFS_FH(dir); data->arg.mknod.name = dentry->d_name.name; @@ -731,8 +712,13 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = nfs3_do_create(dir, dentry, data); if (status != 0) - goto out; - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + goto out_release_acls; + + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply mknod: %d\n", status); @@ -809,7 +795,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -821,18 +807,18 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; } -static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); return 0; } -static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -843,17 +829,11 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } -static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; } -static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ - rpc_call_start(task); - return 0; -} - static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { rpc_call_start(task); @@ -904,20 +884,28 @@ static const struct inode_operations nfs3_dir_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif }; static const struct inode_operations nfs3_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif }; const struct nfs_rpc_ops nfs_v3_clientops = { @@ -939,7 +927,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .unlink_setup = nfs3_proc_unlink_setup, .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare, .unlink_done = nfs3_proc_unlink_done, - .rename = nfs3_proc_rename, .rename_setup = nfs3_proc_rename_setup, .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare, .rename_done = nfs3_proc_rename_done, @@ -953,19 +940,16 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .fsinfo = nfs3_proc_fsinfo, .pathconf = nfs3_proc_pathconf, .decode_dirent = nfs3_decode_dirent, + .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare, .read_setup = nfs3_proc_read_setup, - .read_pageio_init = nfs_pageio_init_read, - .read_rpc_prepare = nfs3_proc_read_rpc_prepare, .read_done = nfs3_read_done, .write_setup = nfs3_proc_write_setup, - .write_pageio_init = nfs_pageio_init_write, - .write_rpc_prepare = nfs3_proc_write_rpc_prepare, .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, .commit_done = nfs3_commit_done, .lock = nfs3_proc_lock, - .clear_acl_cache = nfs3_forget_cached_acls, + .clear_acl_cache = forget_all_cached_acls, .close_context = nfs_close_context, .have_delegation = nfs3_have_delegation, .return_delegation = nfs3_return_delegation, diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index cc471c72523..d6a98949af1 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -12,6 +12,9 @@ static struct nfs_subversion nfs_v3 = { .rpc_vers = &nfs_version3, .rpc_ops = &nfs_v3_clientops, .sops = &nfs_sops, +#ifdef CONFIG_NFS_V3_ACL + .xattr = nfs3_xattr_handlers, +#endif }; static int __init init_nfs_v3(void) diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index fa6d72131c1..8f4cbe7f4aa 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -953,7 +953,7 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, * }; */ static void encode_read3args(struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { __be32 *p; @@ -966,7 +966,7 @@ static void encode_read3args(struct xdr_stream *xdr, static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { encode_read3args(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, @@ -992,7 +992,7 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, * }; */ static void encode_write3args(struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { __be32 *p; @@ -1008,7 +1008,7 @@ static void encode_write3args(struct xdr_stream *xdr, static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { encode_write3args(xdr, args); xdr->buf->flags |= XDRBUF_WRITE; @@ -1589,7 +1589,7 @@ out_default: * }; */ static int decode_read3resok(struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { u32 eof, count, ocount, recvd; __be32 *p; @@ -1625,7 +1625,7 @@ out_overflow: } static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; @@ -1673,7 +1673,7 @@ out_status: * }; */ static int decode_write3resok(struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { __be32 *p; @@ -1697,7 +1697,7 @@ out_eio: } static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 28842abafab..ba2affa5194 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -9,6 +9,14 @@ #ifndef __LINUX_FS_NFS_NFS4_FS_H #define __LINUX_FS_NFS_NFS4_FS_H +#if defined(CONFIG_NFS_V4_2) +#define NFS4_MAX_MINOR_VERSION 2 +#elif defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MINOR_VERSION 1 +#else +#define NFS4_MAX_MINOR_VERSION 0 +#endif + #if IS_ENABLED(CONFIG_NFS_V4) #define NFS4_MAX_LOOP_ON_RECOVER (10) @@ -29,6 +37,8 @@ enum nfs4_client_state { NFS4CLNT_SERVER_SCOPE_MISMATCH, NFS4CLNT_PURGE_STATE, NFS4CLNT_BIND_CONN_TO_SESSION, + NFS4CLNT_MOVED, + NFS4CLNT_LEASE_MOVED, }; #define NFS4_RENEW_TIMEOUT 0x01 @@ -50,6 +60,7 @@ struct nfs4_minor_version_ops { const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; const struct nfs4_state_maintenance_ops *state_renewal_ops; + const struct nfs4_mig_recovery_ops *mig_recovery_ops; }; #define NFS_SEQID_CONFIRMED 1 @@ -203,6 +214,12 @@ struct nfs4_state_maintenance_ops { int (*renew_lease)(struct nfs_client *, struct rpc_cred *); }; +struct nfs4_mig_recovery_ops { + int (*get_locations)(struct inode *, struct nfs4_fs_locations *, + struct page *, struct rpc_cred *); + int (*fsid_present)(struct inode *, struct rpc_cred *); +}; + extern const struct dentry_operations nfs4_dentry_operations; /* dir.c */ @@ -213,10 +230,11 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *, extern struct file_system_type nfs4_fs_type; /* nfs4namespace.c */ -rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); -struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *); +struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *); struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); +int nfs4_replace_transport(struct nfs_server *server, + const struct nfs4_fs_locations *locations); /* nfs4proc.c */ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); @@ -231,6 +249,9 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, struct nfs4_fs_locations *, struct page *); +extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, + struct page *page, struct rpc_cred *); +extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *); extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, struct nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); @@ -249,6 +270,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser extern int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); +extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *); extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, @@ -315,7 +337,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, */ static inline void nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, - struct rpc_message *msg, struct nfs_write_data *wdata) + struct rpc_message *msg, struct nfs_pgio_data *wdata) { if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) @@ -347,7 +369,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags, static inline void nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, - struct rpc_message *msg, struct nfs_write_data *wdata) + struct rpc_message *msg, struct nfs_pgio_data *wdata) { } #endif /* CONFIG_NFS_V4_1 */ @@ -405,12 +427,15 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); extern void nfs_inode_find_state_and_recover(struct inode *inode, const nfs4_stateid *stateid); +extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *); extern void nfs4_schedule_lease_recovery(struct nfs_client *); extern int nfs4_wait_clnt_recover(struct nfs_client *clp); extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); +extern int nfs4_schedule_migration_recovery(const struct nfs_server *); +extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); @@ -476,6 +501,16 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei return memcmp(dst, src, sizeof(*dst)) == 0; } +static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) +{ + return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0; +} + +static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2) +{ + return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; +} + static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) { return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index a860ab566d6..aa9ef487604 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -10,6 +10,7 @@ #include <linux/sunrpc/auth.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/bc_xprt.h> +#include <linux/sunrpc/rpc_pipe_fs.h> #include "internal.h" #include "callback.h" #include "delegation.h" @@ -169,7 +170,7 @@ void nfs41_shutdown_client(struct nfs_client *clp) void nfs40_shutdown_client(struct nfs_client *clp) { if (clp->cl_slot_tbl) { - nfs4_release_slot_table(clp->cl_slot_tbl); + nfs4_shutdown_slot_table(clp->cl_slot_tbl); kfree(clp->cl_slot_tbl); } } @@ -197,6 +198,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + clp->cl_mig_gen = 1; return clp; error: @@ -368,6 +370,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, if (clp->cl_minorversion != 0) __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); + __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); @@ -407,13 +411,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; - nfs_put_client(clp); - if (clp != old) { - clp->cl_preserve_clid = true; - clp = old; - } - return clp; + if (clp != old) + clp->cl_preserve_clid = true; + nfs_put_client(clp); + return old; error: nfs_mark_client_ready(clp, error); @@ -491,9 +493,10 @@ int nfs40_walk_client_list(struct nfs_client *new, prev = pos; status = nfs_wait_client_init_complete(pos); - spin_lock(&nn->nfs_client_lock); if (status < 0) - continue; + goto out; + status = -NFS4ERR_STALE_CLIENTID; + spin_lock(&nn->nfs_client_lock); } if (pos->cl_cons_state != NFS_CS_READY) continue; @@ -528,6 +531,13 @@ int nfs40_walk_client_list(struct nfs_client *new, *result = pos; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); + goto out; + case -ERESTARTSYS: + case -ETIMEDOUT: + /* The callback path may have been inadvertently + * changed. Schedule recovery! + */ + nfs4_schedule_path_down_recovery(pos); default: goto out; } @@ -631,7 +641,8 @@ int nfs41_walk_client_list(struct nfs_client *new, } spin_lock(&nn->nfs_client_lock); if (status < 0) - continue; + break; + status = -NFS4ERR_STALE_CLIENTID; } if (pos->cl_cons_state != NFS_CS_READY) continue; @@ -924,7 +935,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - dprintk("Mount FH: %d\n", mntfh->size); + nfs_display_fhandle(mntfh, "Pseudo-fs root FH"); nfs4_session_set_rwsize(server); @@ -947,9 +958,8 @@ out: * Create a version 4 volume record */ static int nfs4_init_server(struct nfs_server *server, - const struct nfs_parsed_mount_data *data) + struct nfs_parsed_mount_data *data) { - rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; struct rpc_timeout timeparms; int error; @@ -961,9 +971,15 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; server->options = data->options; + server->auth_info = data->auth_info; - if (data->auth_flavor_len >= 1) - pseudoflavor = data->auth_flavors[0]; + /* Use the first specified auth flavor. If this flavor isn't + * allowed by the server, use the SECINFO path to try the + * other specified flavors */ + if (data->auth_info.flavor_len >= 1) + data->selected_flavor = data->auth_info.flavors[0]; + else + data->selected_flavor = RPC_AUTH_UNIX; /* Get a client record */ error = nfs4_set_client(server, @@ -971,7 +987,7 @@ static int nfs4_init_server(struct nfs_server *server, (const struct sockaddr *)&data->nfs_server.address, data->nfs_server.addrlen, data->client_address, - pseudoflavor, + data->selected_flavor, data->nfs_server.protocol, &timeparms, data->minorversion, @@ -991,7 +1007,8 @@ static int nfs4_init_server(struct nfs_server *server, server->port = data->nfs_server.port; - error = nfs_init_server_rpcclient(server, &timeparms, pseudoflavor); + error = nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); error: /* Done */ @@ -1018,7 +1035,7 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, if (!server) return ERR_PTR(-ENOMEM); - auth_probe = mount_info->parsed->auth_flavor_len < 1; + auth_probe = mount_info->parsed->auth_info.flavor_len < 1; /* set up the general RPC client */ error = nfs4_init_server(server, mount_info->parsed); @@ -1046,6 +1063,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, { struct nfs_client *parent_client; struct nfs_server *server, *parent_server; + bool auth_probe; int error; dprintk("--> nfs4_create_referral_server()\n"); @@ -1078,8 +1096,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; - error = nfs4_server_common_setup(server, mntfh, - !(parent_server->flags & NFS_MOUNT_SECFLAVOUR)); + auth_probe = parent_server->auth_info.flavor_len < 1; + + error = nfs4_server_common_setup(server, mntfh, auth_probe); if (error < 0) goto error; @@ -1091,3 +1110,112 @@ error: dprintk("<-- nfs4_create_referral_server() = error %d\n", error); return ERR_PTR(error); } + +/* + * Grab the destination's particulars, including lease expiry time. + * + * Returns zero if probe succeeded and retrieved FSID matches the FSID + * we have cached. + */ +static int nfs_probe_destination(struct nfs_server *server) +{ + struct inode *inode = server->super->s_root->d_inode; + struct nfs_fattr *fattr; + int error; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* Sanity: the probe won't work if the destination server + * does not recognize the migrated FH. */ + error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr); + + nfs_free_fattr(fattr); + return error; +} + +/** + * nfs4_update_server - Move an nfs_server to a different nfs_client + * + * @server: represents FSID to be moved + * @hostname: new end-point's hostname + * @sap: new end-point's socket address + * @salen: size of "sap" + * @net: net namespace + * + * The nfs_server must be quiescent before this function is invoked. + * Either its session is drained (NFSv4.1+), or its transport is + * plugged and drained (NFSv4.0). + * + * Returns zero on success, or a negative errno value. + */ +int nfs4_update_server(struct nfs_server *server, const char *hostname, + struct sockaddr *sap, size_t salen, struct net *net) +{ + struct nfs_client *clp = server->nfs_client; + struct rpc_clnt *clnt = server->client; + struct xprt_create xargs = { + .ident = clp->cl_proto, + .net = net, + .dstaddr = sap, + .addrlen = salen, + .servername = hostname, + }; + char buf[INET6_ADDRSTRLEN + 1]; + struct sockaddr_storage address; + struct sockaddr *localaddr = (struct sockaddr *)&address; + int error; + + dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + hostname); + + error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); + if (error != 0) { + dprintk("<-- %s(): rpc_switch_client_transport returned %d\n", + __func__, error); + goto out; + } + + error = rpc_localaddr(clnt, localaddr, sizeof(address)); + if (error != 0) { + dprintk("<-- %s(): rpc_localaddr returned %d\n", + __func__, error); + goto out; + } + + error = -EAFNOSUPPORT; + if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) { + dprintk("<-- %s(): rpc_ntop returned %d\n", + __func__, error); + goto out; + } + + nfs_server_remove_lists(server); + error = nfs4_set_client(server, hostname, sap, salen, buf, + clp->cl_rpcclient->cl_auth->au_flavor, + clp->cl_proto, clnt->cl_timeout, + clp->cl_minorversion, net); + nfs_put_client(clp); + if (error != 0) { + nfs_server_insert_lists(server); + dprintk("<-- %s(): nfs4_set_client returned %d\n", + __func__, error); + goto out; + } + + if (server->nfs_client->cl_hostname == NULL) + server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + nfs_server_insert_lists(server); + + error = nfs_probe_destination(server); + if (error < 0) + goto out; + + dprintk("<-- %s() succeeded\n", __func__); + +out: + return error; +} diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 9c8f09a2156..a816f0627a6 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -73,7 +73,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); nfs_file_set_open_context(filp, ctx); - nfs_fscache_set_inode_cookie(inode, filp); + nfs_fscache_open_file(inode, filp); err = 0; out_put_ctx: @@ -100,8 +100,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) break; mutex_lock(&inode->i_mutex); ret = nfs_file_fsync_commit(file, start, end, datasync); - if (!ret && !datasync) - /* application has asked for meta-data sync */ + if (!ret) ret = pnfs_layoutcommit_inode(inode, true); mutex_unlock(&inode->i_mutex); /* @@ -118,10 +117,10 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) const struct file_operations nfs4_file_operations = { .llseek = nfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = nfs_file_read, - .aio_write = nfs_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = nfs_file_read, + .write_iter = nfs_file_write, .mmap = nfs_file_mmap, .open = nfs4_file_open, .flush = nfs_file_flush, @@ -130,7 +129,7 @@ const struct file_operations nfs4_file_operations = { .lock = nfs_lock, .flock = nfs_flock, .splice_read = nfs_file_splice_read, - .splice_write = nfs_file_splice_write, + .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = nfs_setlease, }; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 049b9fb0d2c..3d83cb1fdc7 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -121,9 +121,8 @@ static int nfs4_validate_fspath(struct dentry *dentry, } static size_t nfs_parse_server_name(char *string, size_t len, - struct sockaddr *sa, size_t salen, struct nfs_server *server) + struct sockaddr *sa, size_t salen, struct net *net) { - struct net *net = rpc_net_ns(server->client); ssize_t ret; ret = rpc_pton(net, string, len, sa, salen); @@ -137,17 +136,25 @@ static size_t nfs_parse_server_name(char *string, size_t len, /** * nfs_find_best_sec - Find a security mechanism supported locally + * @server: NFS server struct * @flavors: List of security tuples returned by SECINFO procedure * - * Return the pseudoflavor of the first security mechanism in - * "flavors" that is locally supported. Return RPC_AUTH_UNIX if - * no matching flavor is found in the array. The "flavors" array + * Return an rpc client that uses the first security mechanism in + * "flavors" that is locally supported. The "flavors" array * is searched in the order returned from the server, per RFC 3530 - * recommendation. + * recommendation and each flavor is checked for membership in the + * sec= mount option list if it exists. + * + * Return -EPERM if no matching flavor is found in the array. + * + * Please call rpc_shutdown_client() when you are done with this rpc client. + * */ -rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt, + struct nfs_server *server, + struct nfs4_secinfo_flavors *flavors) { - rpc_authflavor_t pseudoflavor; + rpc_authflavor_t pflavor; struct nfs4_secinfo4 *secinfo; unsigned int i; @@ -158,55 +165,73 @@ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) case RPC_AUTH_NULL: case RPC_AUTH_UNIX: case RPC_AUTH_GSS: - pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, + pflavor = rpcauth_get_pseudoflavor(secinfo->flavor, &secinfo->flavor_info); - if (pseudoflavor != RPC_AUTH_MAXFLAVOR) - return pseudoflavor; - break; + /* does the pseudoflavor match a sec= mount opt? */ + if (pflavor != RPC_AUTH_MAXFLAVOR && + nfs_auth_info_match(&server->auth_info, pflavor)) { + struct rpc_clnt *new; + struct rpc_cred *cred; + + /* Cloning creates an rpc_auth for the flavor */ + new = rpc_clone_client_set_auth(clnt, pflavor); + if (IS_ERR(new)) + continue; + /** + * Check that the user actually can use the + * flavor. This is mostly for RPC_AUTH_GSS + * where cr_init obtains a gss context + */ + cred = rpcauth_lookupcred(new->cl_auth, 0); + if (IS_ERR(cred)) { + rpc_shutdown_client(new); + continue; + } + put_rpccred(cred); + return new; + } } } - - return RPC_AUTH_UNIX; + return ERR_PTR(-EPERM); } -static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name) +/** + * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup, + * return an rpc_clnt that uses the best available security flavor with + * respect to the secinfo flavor list and the sec= mount options. + * + * @clnt: RPC client to clone + * @inode: directory inode + * @name: lookup name + * + * Please call rpc_shutdown_client() when you are done with this rpc client. + */ +struct rpc_clnt * +nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode, + struct qstr *name) { struct page *page; struct nfs4_secinfo_flavors *flavors; - rpc_authflavor_t flavor; + struct rpc_clnt *new; int err; page = alloc_page(GFP_KERNEL); if (!page) - return -ENOMEM; + return ERR_PTR(-ENOMEM); + flavors = page_address(page); err = nfs4_proc_secinfo(inode, name, flavors); if (err < 0) { - flavor = err; + new = ERR_PTR(err); goto out; } - flavor = nfs_find_best_sec(flavors); + new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors); out: put_page(page); - return flavor; -} - -/* - * Please call rpc_shutdown_client() when you are done with this client. - */ -struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode, - struct qstr *name) -{ - rpc_authflavor_t flavor; - - flavor = nfs4_negotiate_security(inode, name); - if ((int)flavor < 0) - return ERR_PTR((int)flavor); - - return rpc_clone_client_set_auth(clnt, flavor); + return new; } static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, @@ -214,6 +239,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, const struct nfs4_fs_location *location) { const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct net *net = rpc_net_ns(NFS_SB(mountdata->sb)->client); struct vfsmount *mnt = ERR_PTR(-ENOENT); char *mnt_path; unsigned int maxbuflen; @@ -239,8 +265,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, continue; mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, - mountdata->addr, addr_bufsize, - NFS_SB(mountdata->sb)); + mountdata->addr, addr_bufsize, net); if (mountdata->addrlen == 0) continue; @@ -389,13 +414,110 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, if (client->cl_auth->au_flavor != flavor) flavor = client->cl_auth->au_flavor; - else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) { - rpc_authflavor_t new = nfs4_negotiate_security(dir, name); - if ((int)new >= 0) - flavor = new; - } mnt = nfs_do_submount(dentry, fh, fattr, flavor); out: rpc_shutdown_client(client); return mnt; } + +/* + * Try one location from the fs_locations array. + * + * Returns zero on success, or a negative errno value. + */ +static int nfs4_try_replacing_one_location(struct nfs_server *server, + char *page, char *page2, + const struct nfs4_fs_location *location) +{ + const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct net *net = rpc_net_ns(server->client); + struct sockaddr *sap; + unsigned int s; + size_t salen; + int error; + + sap = kmalloc(addr_bufsize, GFP_KERNEL); + if (sap == NULL) + return -ENOMEM; + + error = -ENOENT; + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + char *hostname; + + if (buf->len <= 0 || buf->len > PAGE_SIZE) + continue; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL) + continue; + + salen = nfs_parse_server_name(buf->data, buf->len, + sap, addr_bufsize, net); + if (salen == 0) + continue; + rpc_set_port(sap, NFS_PORT); + + error = -ENOMEM; + hostname = kstrndup(buf->data, buf->len, GFP_KERNEL); + if (hostname == NULL) + break; + + error = nfs4_update_server(server, hostname, sap, salen, net); + kfree(hostname); + if (error == 0) + break; + } + + kfree(sap); + return error; +} + +/** + * nfs4_replace_transport - set up transport to destination server + * + * @server: export being migrated + * @locations: fs_locations array + * + * Returns zero on success, or a negative errno value. + * + * The client tries all the entries in the "locations" array, in the + * order returned by the server, until one works or the end of the + * array is reached. + */ +int nfs4_replace_transport(struct nfs_server *server, + const struct nfs4_fs_locations *locations) +{ + char *page = NULL, *page2 = NULL; + int loc, error; + + error = -ENOENT; + if (locations == NULL || locations->nlocations <= 0) + goto out; + + error = -ENOMEM; + page = (char *) __get_free_page(GFP_USER); + if (!page) + goto out; + page2 = (char *) __get_free_page(GFP_USER); + if (!page2) + goto out; + + for (loc = 0; loc < locations->nlocations; loc++) { + const struct nfs4_fs_location *location = + &locations->locations[loc]; + + if (location == NULL || location->nservers <= 0 || + location->rootpath.ncomponents == 0) + continue; + + error = nfs4_try_replacing_one_location(server, page, + page2, location); + if (error == 0) + break; + } + +out: + free_page((unsigned long)page); + free_page((unsigned long)page2); + return error; +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 75e46bbf7f4..4bf3d97cc5a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -105,9 +105,6 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) return NULL; - if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2) - return NULL; - err = security_dentry_init_security(dentry, sattr->ia_mode, &dentry->d_name, (void **)&label->label, &label->len); if (err == 0) @@ -384,6 +381,14 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); goto wait_on_recovery; + case -NFS4ERR_MOVED: + ret = nfs4_schedule_migration_recovery(server); + if (ret < 0) + break; + goto wait_on_recovery; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -431,6 +436,8 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc return nfs4_map_errors(ret); wait_on_recovery: ret = nfs4_wait_clnt_recover(clp); + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + return -EIO; if (ret == 0) exception->retry = 1; return ret; @@ -532,7 +539,7 @@ static int nfs40_sequence_done(struct rpc_task *task, struct nfs4_slot *slot = res->sr_slot; struct nfs4_slot_table *tbl; - if (!RPC_WAS_SENT(task)) + if (slot == NULL) goto out; tbl = slot->table; @@ -552,15 +559,10 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot_table *tbl; + struct nfs4_slot *slot = res->sr_slot; bool send_new_highest_used_slotid = false; - if (!res->sr_slot) { - /* just wake up the next guy waiting since - * we may have not consumed a slot after all */ - dprintk("%s: No slot\n", __func__); - return; - } - tbl = res->sr_slot->table; + tbl = slot->table; session = tbl->session; spin_lock(&tbl->slot_tbl_lock); @@ -570,11 +572,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) if (tbl->highest_used_slotid > tbl->target_highest_slotid) send_new_highest_used_slotid = true; - if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) { + if (nfs41_wake_and_assign_slot(tbl, slot)) { send_new_highest_used_slotid = false; goto out_unlock; } - nfs4_free_slot(tbl, res->sr_slot); + nfs4_free_slot(tbl, slot); if (tbl->highest_used_slotid != NFS4_NO_SLOT) send_new_highest_used_slotid = false; @@ -585,19 +587,20 @@ out_unlock: nfs41_server_notify_highest_slotid_update(session->clp); } -static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { struct nfs4_session *session; - struct nfs4_slot *slot; + struct nfs4_slot *slot = res->sr_slot; struct nfs_client *clp; bool interrupted = false; int ret = 1; + if (slot == NULL) + goto out_noaction; /* don't increment the sequence number if the task wasn't sent */ if (!RPC_WAS_SENT(task)) goto out; - slot = res->sr_slot; session = slot->table->session; if (slot->interrupted) { @@ -672,6 +675,7 @@ out: /* The session may be reset by one of the error handlers. */ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); nfs41_sequence_free_slot(res); +out_noaction: return ret; retry_nowait: if (rpc_restart_call_prepare(task)) { @@ -685,6 +689,7 @@ out_retry: rpc_delay(task, NFS4_POLL_RETRY_MAX); return 0; } +EXPORT_SYMBOL_GPL(nfs41_sequence_done); static int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -1063,6 +1068,7 @@ static void nfs4_opendata_free(struct kref *kref) dput(p->dentry); nfs_sb_deactive(sb); nfs_fattr_free_names(&p->f_attr); + kfree(p->f_attr.mdsthreshold); kfree(p); } @@ -1132,12 +1138,71 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) nfs4_state_set_mode_locked(state, state->state | fmode); } -static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) +{ + struct nfs_client *clp = state->owner->so_server->nfs_client; + bool need_recover = false; + + if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly) + need_recover = true; + if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly) + need_recover = true; + if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr) + need_recover = true; + if (need_recover) + nfs4_state_mark_reclaim_nograce(clp, state); +} + +static bool nfs_need_update_open_stateid(struct nfs4_state *state, + nfs4_stateid *stateid) +{ + if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) + return true; + if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs_test_and_clear_all_open_stateid(state); + return true; + } + if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) + return true; + return false; +} + +static void nfs_clear_open_stateid_locked(struct nfs4_state *state, + nfs4_stateid *stateid, fmode_t fmode) { + clear_bit(NFS_O_RDWR_STATE, &state->flags); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_WRITE: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_READ: + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case 0: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); + } + if (stateid == NULL) + return; + if (!nfs_need_update_open_stateid(state, stateid)) + return; if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); - set_bit(NFS_OPEN_STATE, &state->flags); +} + +static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ + write_seqlock(&state->seqlock); + nfs_clear_open_stateid_locked(state, stateid, fmode); + write_sequnlock(&state->seqlock); + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) + nfs4_schedule_state_manager(state->owner->so_server->nfs_client); +} + +static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ switch (fmode) { case FMODE_READ: set_bit(NFS_O_RDONLY_STATE, &state->flags); @@ -1148,13 +1213,11 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * case FMODE_READ|FMODE_WRITE: set_bit(NFS_O_RDWR_STATE, &state->flags); } -} - -static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) -{ - write_seqlock(&state->seqlock); - nfs_set_open_stateid_locked(state, stateid, fmode); - write_sequnlock(&state->seqlock); + if (!nfs_need_update_open_stateid(state, stateid)) + return; + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + nfs4_stateid_copy(&state->stateid, stateid); + nfs4_stateid_copy(&state->open_stateid, stateid); } static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) @@ -1212,6 +1275,8 @@ no_delegation: __update_open_stateid(state, open_stateid, NULL, fmode); ret = 1; } + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) + nfs4_schedule_state_manager(state->owner->so_server->nfs_client); return ret; } @@ -1318,31 +1383,24 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) int ret; if (!data->rpc_done) { - ret = data->rpc_status; - goto err; + if (data->rpc_status) { + ret = data->rpc_status; + goto err; + } + /* cached opens have already been processed */ + goto update; } - ret = -ESTALE; - if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) || - !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) || - !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE)) - goto err; - - ret = -ENOMEM; - state = nfs4_get_open_state(inode, data->owner); - if (state == NULL) - goto err; - ret = nfs_refresh_inode(inode, &data->f_attr); if (ret) goto err; - nfs_setsecurity(inode, &data->f_attr, data->f_label); - if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); +update: update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode); + atomic_inc(&state->count); return state; err: @@ -1452,12 +1510,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * struct nfs4_state *newstate; int ret; + /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */ + clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); /* memory barrier prior to reading state->n_* */ clear_bit(NFS_DELEGATED_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); smp_rmb(); if (state->n_rdwr != 0) { - clear_bit(NFS_O_RDWR_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1465,7 +1526,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_wronly != 0) { - clear_bit(NFS_O_WRONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1473,7 +1533,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_rdonly != 0) { - clear_bit(NFS_O_RDONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); if (ret != 0) return ret; @@ -1575,6 +1634,12 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); return -EAGAIN; + case -NFS4ERR_MOVED: + nfs4_schedule_migration_recovery(server); + return -EAGAIN; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(server->nfs_client); + return -EAGAIN; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -1616,15 +1681,15 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, - &data->o_res.seq_res, task); + nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, + &data->c_res.seq_res, task); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_sequence_done(task, &data->o_res.seq_res); + nfs40_sequence_done(task, &data->c_res.seq_res); data->rpc_status = task->tk_status; if (data->rpc_status == 0) { @@ -1682,7 +1747,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) }; int status; - nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1); + nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1); kref_get(&data->kref); data->rpc_done = 0; data->rpc_status = 0; @@ -1962,7 +2027,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) - _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); + nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); return 0; } @@ -2240,10 +2305,12 @@ static int _nfs4_do_open(struct inode *dir, } } - if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { - opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); - if (!opendata->f_attr.mdsthreshold) - goto err_free_label; + if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { + if (!opendata->f_attr.mdsthreshold) { + opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); + if (!opendata->f_attr.mdsthreshold) + goto err_free_label; + } opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; } if (dentry->d_inode != NULL) @@ -2271,11 +2338,10 @@ static int _nfs4_do_open(struct inode *dir, if (opendata->file_created) *opened |= FILE_CREATED; - if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) + if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) { *ctx_th = opendata->f_attr.mdsthreshold; - else - kfree(opendata->f_attr.mdsthreshold); - opendata->f_attr.mdsthreshold = NULL; + opendata->f_attr.mdsthreshold = NULL; + } nfs4_label_free(olabel); @@ -2285,7 +2351,6 @@ static int _nfs4_do_open(struct inode *dir, err_free_label: nfs4_label_free(olabel); err_opendata_put: - kfree(opendata->f_attr.mdsthreshold); nfs4_opendata_put(opendata); err_put_state_owner: nfs4_put_state_owner(sp); @@ -2394,13 +2459,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { /* Use that stateid */ - } else if (truncate && state != NULL && nfs4_valid_open_stateid(state)) { + } else if (truncate && state != NULL) { struct nfs_lockowner lockowner = { .l_owner = current->files, .l_pid = current->tgid, }; - nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, - &lockowner); + if (!nfs4_valid_open_stateid(state)) + return -EBADF; + if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, + &lockowner) == -EIO) + return -EBADF; } else nfs4_stateid_copy(&arg.stateid, &zero_stateid); @@ -2472,26 +2540,6 @@ static void nfs4_free_closedata(void *data) kfree(calldata); } -static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, - fmode_t fmode) -{ - spin_lock(&state->owner->so_lock); - clear_bit(NFS_O_RDWR_STATE, &state->flags); - switch (fmode & (FMODE_READ|FMODE_WRITE)) { - case FMODE_WRITE: - clear_bit(NFS_O_RDONLY_STATE, &state->flags); - break; - case FMODE_READ: - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - break; - case 0: - clear_bit(NFS_O_RDONLY_STATE, &state->flags); - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_OPEN_STATE, &state->flags); - } - spin_unlock(&state->owner->so_lock); -} - static void nfs4_close_done(struct rpc_task *task, void *data) { struct nfs4_closedata *calldata = data; @@ -2510,11 +2558,10 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->roc) pnfs_roc_set_barrier(state->inode, calldata->roc_barrier); - nfs_set_open_stateid(state, &calldata->res.stateid, 0); + nfs_clear_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); - nfs4_close_clear_stateid_flags(state, - calldata->arg.fmode); - break; + goto out_release; + case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: @@ -2522,9 +2569,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) break; default: - if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { rpc_restart_call_prepare(task); + goto out_release; + } } + nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); +out_release: nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); @@ -2697,6 +2748,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) nfs4_close_state(ctx->state, ctx->mode); } +#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) +#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL) + static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { struct nfs4_server_caps_arg args = { @@ -2712,13 +2767,27 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { + /* Sanity check the server answers */ + switch (server->nfs_client->cl_minorversion) { + case 0: + res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK; + res.attr_bitmask[2] = 0; + break; + case 1: + res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK; + break; + case 2: + res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK; + } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| NFS_CAP_SYMLINKS|NFS_CAP_FILEID| NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| - NFS_CAP_CTIME|NFS_CAP_MTIME); - if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) + NFS_CAP_CTIME|NFS_CAP_MTIME| + NFS_CAP_SECURITY_LABEL); + if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && + res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) server->caps |= NFS_CAP_ACLS; if (res.has_links != 0) server->caps |= NFS_CAP_HARDLINKS; @@ -2746,14 +2815,12 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f #endif memcpy(server->attr_bitmask_nl, res.attr_bitmask, sizeof(server->attr_bitmask)); + server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - if (server->caps & NFS_CAP_SECURITY_LABEL) { - server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - } memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + server->cache_consistency_bitmask[2] = 0; server->acl_bitmask = res.acl_bitmask; server->fh_expire_type = res.fh_expire_type; } @@ -2864,11 +2931,24 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, int status = -EPERM; size_t i; - for (i = 0; i < ARRAY_SIZE(flav_array); i++) { - status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); - if (status == -NFS4ERR_WRONGSEC || status == -EACCES) - continue; - break; + if (server->auth_info.flavor_len > 0) { + /* try each flavor specified by user */ + for (i = 0; i < server->auth_info.flavor_len; i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, + server->auth_info.flavors[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } + } else { + /* no flavors specified by user, try default list */ + for (i = 0; i < ARRAY_SIZE(flav_array); i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, + flav_array[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } } /* @@ -2910,9 +2990,6 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, status = nfs4_lookup_root(server, fhandle, info); if (status != -NFS4ERR_WRONGSEC) break; - /* Did user force a 'sec=' mount option? */ - if (server->flags & NFS_MOUNT_SECFLAVOUR) - break; default: status = nfs4_do_find_root_sec(server, fhandle, info); } @@ -2981,11 +3058,16 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, status = nfs4_proc_fs_locations(client, dir, name, locations, page); if (status != 0) goto out; - /* Make sure server returned a different fsid for the referral */ + + /* + * If the fsid didn't change, this is a migration event, not a + * referral. Cause us to drop into the exception handler, which + * will kick off migration recovery. + */ if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { dprintk("%s: server did not return a different fsid for" " a referral at %s\n", __func__, name->name); - status = -EIO; + status = -NFS4ERR_MOVED; goto out; } /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ @@ -3165,10 +3247,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, err = -EPERM; if (client != *clnt) goto out; - /* No security negotiation if the user specified 'sec=' */ - if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR) - goto out; - client = nfs4_create_sec_client(client, dir, name); + client = nfs4_negotiate_security(client, dir, name); if (IS_ERR(client)) return PTR_ERR(client); @@ -3469,49 +3548,6 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, return 1; } -static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs_server *server = NFS_SERVER(old_dir); - struct nfs_renameargs arg = { - .old_dir = NFS_FH(old_dir), - .new_dir = NFS_FH(new_dir), - .old_name = old_name, - .new_name = new_name, - }; - struct nfs_renameres res = { - .server = server, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status = -ENOMEM; - - status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); - if (!status) { - update_changeattr(old_dir, &res.old_cinfo); - update_changeattr(new_dir, &res.new_cinfo); - } - return status; -} - -static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_proc_rename(old_dir, old_name, - new_dir, new_name); - trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err); - err = nfs4_handle_exception(NFS_SERVER(old_dir), err, - &exception); - } while (exception.retry); - return err; -} - static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { struct nfs_server *server = NFS_SERVER(inode); @@ -3976,8 +4012,9 @@ static bool nfs4_stateid_is_current(nfs4_stateid *stateid, { nfs4_stateid current_stateid; - if (nfs4_set_rw_stateid(¤t_stateid, ctx, l_ctx, fmode)) - return false; + /* If the current stateid represents a lost lock, then exit */ + if (nfs4_set_rw_stateid(¤t_stateid, ctx, l_ctx, fmode) == -EIO) + return true; return nfs4_stateid_match(stateid, ¤t_stateid); } @@ -3996,12 +4033,12 @@ static bool nfs4_error_stateid_expired(int err) return false; } -void __nfs4_read_done_cb(struct nfs_read_data *data) +void __nfs4_read_done_cb(struct nfs_pgio_data *data) { nfs_invalidate_atime(data->header->inode); } -static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) { struct nfs_server *server = NFS_SERVER(data->header->inode); @@ -4018,7 +4055,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) } static bool nfs4_read_stateid_changed(struct rpc_task *task, - struct nfs_readargs *args) + struct nfs_pgio_args *args) { if (!nfs4_error_stateid_expired(task->tk_status) || @@ -4031,7 +4068,7 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { dprintk("--> %s\n", __func__); @@ -4040,19 +4077,19 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) return -EAGAIN; if (nfs4_read_stateid_changed(task, &data->args)) return -EAGAIN; - return data->read_done_cb ? data->read_done_cb(task, data) : + return data->pgio_done_cb ? data->pgio_done_cb(task, data) : nfs4_read_done_cb(task, data); } -static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { data->timestamp = jiffies; - data->read_done_cb = nfs4_read_done_cb; + data->pgio_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); } -static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, @@ -4060,14 +4097,14 @@ static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_dat task)) return 0; if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, - data->args.lock_context, FMODE_READ) == -EIO) + data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO) return -EIO; if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) return -EIO; return 0; } -static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -4084,7 +4121,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data } static bool nfs4_write_stateid_changed(struct rpc_task *task, - struct nfs_writeargs *args) + struct nfs_pgio_args *args) { if (!nfs4_error_stateid_expired(task->tk_status) || @@ -4097,18 +4134,18 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; if (nfs4_write_stateid_changed(task, &data->args)) return -EAGAIN; - return data->write_done_cb ? data->write_done_cb(task, data) : + return data->pgio_done_cb ? data->pgio_done_cb(task, data) : nfs4_write_done_cb(task, data); } static -bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data) +bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data) { const struct nfs_pgio_header *hdr = data->header; @@ -4121,7 +4158,7 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data) return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; } -static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->header->inode); @@ -4131,8 +4168,8 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag } else data->args.bitmask = server->cache_consistency_bitmask; - if (!data->write_done_cb) - data->write_done_cb = nfs4_write_done_cb; + if (!data->pgio_done_cb) + data->pgio_done_cb = nfs4_write_done_cb; data->res.server = server; data->timestamp = jiffies; @@ -4140,21 +4177,6 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); } -static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ - if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return 0; - if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, - data->args.lock_context, FMODE_WRITE) == -EIO) - return -EIO; - if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) - return -EIO; - return 0; -} - static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { nfs4_setup_sequence(NFS_SERVER(data->inode), @@ -4220,7 +4242,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) unsigned long timestamp = data->timestamp; trace_nfs4_renew_async(clp, task->tk_status); - if (task->tk_status < 0) { + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + break; + default: /* Unless we're shutting down, schedule state recovery! */ if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0) return; @@ -4279,9 +4307,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) static inline int nfs4_server_supports_acls(struct nfs_server *server) { - return (server->caps & NFS_CAP_ACLS) - && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) - && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); + return server->caps & NFS_CAP_ACLS; } /* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that @@ -4574,7 +4600,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, struct nfs4_label label = {0, 0, buflen, buf}; u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; - struct nfs4_getattr_arg args = { + struct nfs4_getattr_arg arg = { .fh = NFS_FH(inode), .bitmask = bitmask, }; @@ -4585,14 +4611,14 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR], - .rpc_argp = &args, + .rpc_argp = &arg, .rpc_resp = &res, }; int ret; nfs_fattr_init(&fattr); - ret = rpc_call_sync(server->client, &msg, 0); + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); if (ret) return ret; if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) @@ -4629,7 +4655,7 @@ static int _nfs4_do_set_security_label(struct inode *inode, struct iattr sattr = {0}; struct nfs_server *server = NFS_SERVER(inode); const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; - struct nfs_setattrargs args = { + struct nfs_setattrargs arg = { .fh = NFS_FH(inode), .iap = &sattr, .server = server, @@ -4643,14 +4669,14 @@ static int _nfs4_do_set_security_label(struct inode *inode, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &args, + .rpc_argp = &arg, .rpc_resp = &res, }; int status; - nfs4_stateid_copy(&args.stateid, &zero_stateid); + nfs4_stateid_copy(&arg.stateid, &zero_stateid); - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status) dprintk("%s failed: %d\n", __func__, status); @@ -4734,17 +4760,24 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, if (state == NULL) break; if (nfs4_schedule_stateid_recovery(server, state) < 0) - goto stateid_invalid; + goto recovery_failed; goto wait_on_recovery; case -NFS4ERR_EXPIRED: if (state != NULL) { if (nfs4_schedule_stateid_recovery(server, state) < 0) - goto stateid_invalid; + goto recovery_failed; } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); goto wait_on_recovery; + case -NFS4ERR_MOVED: + if (nfs4_schedule_migration_recovery(server) < 0) + goto recovery_failed; + goto wait_on_recovery; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -4756,29 +4789,28 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); - task->tk_status = 0; - return -EAGAIN; + goto wait_on_recovery; #endif /* CONFIG_NFS_V4_1 */ case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: rpc_delay(task, NFS4_POLL_RETRY_MAX); - task->tk_status = 0; - return -EAGAIN; case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_OLD_STATEID: - task->tk_status = 0; - return -EAGAIN; + goto restart_call; } task->tk_status = nfs4_map_errors(task->tk_status); return 0; -stateid_invalid: +recovery_failed: task->tk_status = -EIO; return 0; wait_on_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + goto recovery_failed; +restart_call: task->tk_status = 0; return -EAGAIN; } @@ -4835,6 +4867,20 @@ nfs4_init_uniform_client_string(const struct nfs_client *clp, nodename); } +/* + * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback + * services. Advertise one based on the address family of the + * clientaddr. + */ +static unsigned int +nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len) +{ + if (strchr(clp->cl_ipaddr, ':') != NULL) + return scnprintf(buf, len, "tcp6"); + else + return scnprintf(buf, len, "tcp"); +} + /** * nfs4_proc_setclientid - Negotiate client ID * @clp: state data structure @@ -4876,12 +4922,10 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, setclientid.sc_name, sizeof(setclientid.sc_name)); /* cb_client4 */ - rcu_read_lock(); - setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, - sizeof(setclientid.sc_netid), "%s", - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_NETID)); - rcu_read_unlock(); + setclientid.sc_netid_len = + nfs4_init_callback_netid(clp, + setclientid.sc_netid, + sizeof(setclientid.sc_netid)); setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, sizeof(setclientid.sc_uaddr), "%s.%u.%u", clp->cl_ipaddr, port >> 8, port & 255); @@ -4942,11 +4986,17 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); switch (task->tk_status) { - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: case 0: renew_lease(data->res.server, data->timestamp); break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + task->tk_status = 0; + break; default: if (nfs4_async_handle_error(task, data->res.server, NULL) == -EAGAIN) { @@ -5105,6 +5155,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock status = 0; } request->fl_ops->fl_release_private(request); + request->fl_ops = NULL; out: return status; } @@ -5776,21 +5827,36 @@ struct nfs_release_lockowner_data { struct nfs4_lock_state *lsp; struct nfs_server *server; struct nfs_release_lockowner_args args; - struct nfs4_sequence_args seq_args; - struct nfs4_sequence_res seq_res; + struct nfs_release_lockowner_res res; + unsigned long timestamp; }; static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata) { struct nfs_release_lockowner_data *data = calldata; nfs40_setup_sequence(data->server, - &data->seq_args, &data->seq_res, task); + &data->args.seq_args, &data->res.seq_res, task); + data->timestamp = jiffies; } static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata) { struct nfs_release_lockowner_data *data = calldata; - nfs40_sequence_done(task, &data->seq_res); + struct nfs_server *server = data->server; + + nfs40_sequence_done(task, &data->res.seq_res); + + switch (task->tk_status) { + case 0: + renew_lease(server, data->timestamp); + break; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_LEASE_MOVED: + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } } static void nfs4_release_lockowner_release(void *calldata) @@ -5819,7 +5885,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st data = kmalloc(sizeof(*data), GFP_NOFS); if (!data) return -ENOMEM; - nfs4_init_sequence(&data->seq_args, &data->seq_res, 0); data->lsp = lsp; data->server = server; data->args.lock_owner.clientid = server->nfs_client->cl_clientid; @@ -5827,6 +5892,8 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st data->args.lock_owner.s_dev = server->s_dev; msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); return 0; } @@ -5989,6 +6056,283 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, return err; } +/* + * This operation also signals the server that this client is + * performing migration recovery. The server can stop returning + * NFS4ERR_LEASE_MOVED to this client. A RENEW operation is + * appended to this compound to identify the client ID which is + * performing recovery. + */ +static int _nfs40_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .clientid = server->nfs_client->cl_clientid, + .fh = NFS_FH(inode), + .page = page, + .bitmask = bitmask, + .migration = 1, /* skip LOOKUP */ + .renew = 1, /* append RENEW */ + }; + struct nfs4_fs_locations_res res = { + .fs_locations = locations, + .migration = 1, + .renew = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + nfs_fattr_init(&locations->fattr); + locations->server = server; + locations->nlocations = 0; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + if (status) + return status; + + renew_lease(server, now); + return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing migration recovery. The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client. The client ID + * performing this operation is identified in the SEQUENCE + * operation in this compound. + * + * When the client supports GETATTR(fs_locations_info), it can + * be plumbed in here. + */ +static int _nfs41_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .fh = NFS_FH(inode), + .page = page, + .bitmask = bitmask, + .migration = 1, /* skip LOOKUP */ + }; + struct nfs4_fs_locations_res res = { + .fs_locations = locations, + .migration = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + nfs_fattr_init(&locations->fattr); + locations->server = server; + locations->nlocations = 0; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + if (status == NFS4_OK && + res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) + status = -NFS4ERR_LEASE_MOVED; + return status; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_get_locations - discover locations for a migrated FSID + * @inode: inode on FSID that is migrating + * @locations: result of query + * @page: buffer + * @cred: credential to use for this operation + * + * Returns NFS4_OK on success, a negative NFS4ERR status code if the + * operation failed, or a negative errno if a local error occurred. + * + * On success, "locations" is filled in, but if the server has + * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not + * asserted. + * + * -NFS4ERR_LEASE_MOVED is returned if the server still has leases + * from this client that require migration recovery. + */ +int nfs4_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + const struct nfs4_mig_recovery_ops *ops = + clp->cl_mvops->mig_recovery_ops; + struct nfs4_exception exception = { }; + int status; + + dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + nfs_display_fhandle(NFS_FH(inode), __func__); + + do { + status = ops->get_locations(inode, locations, page, cred); + if (status != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + return status; +} + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery. The server can stop + * returning NFS4ERR_LEASE_MOVED to this client. A RENEW operation + * is appended to this compound to identify the client ID which is + * performing recovery. + */ +static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct rpc_clnt *clnt = server->client; + struct nfs4_fsid_present_arg args = { + .fh = NFS_FH(inode), + .clientid = clp->cl_clientid, + .renew = 1, /* append RENEW */ + }; + struct nfs4_fsid_present_res res = { + .renew = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + res.fh = nfs_alloc_fhandle(); + if (res.fh == NULL) + return -ENOMEM; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + nfs_free_fhandle(res.fh); + if (status) + return status; + + do_renew_lease(clp, now); + return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery. The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client. The client ID performing + * this operation is identified in the SEQUENCE operation in this + * compound. + */ +static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + struct nfs4_fsid_present_arg args = { + .fh = NFS_FH(inode), + }; + struct nfs4_fsid_present_res res = { + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + res.fh = nfs_alloc_fhandle(); + if (res.fh == NULL) + return -ENOMEM; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + nfs_free_fhandle(res.fh); + if (status == NFS4_OK && + res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) + status = -NFS4ERR_LEASE_MOVED; + return status; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_fsid_present - Is this FSID present or absent on server? + * @inode: inode on FSID to check + * @cred: credential to use for this operation + * + * Server indicates whether the FSID is present, moved, or not + * recognized. This operation is necessary to clear a LEASE_MOVED + * condition for this client ID. + * + * Returns NFS4_OK if the FSID is present on this server, + * -NFS4ERR_MOVED if the FSID is no longer present, a negative + * NFS4ERR code if some error occurred on the server, or a + * negative errno if a local failure occurred. + */ +int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + const struct nfs4_mig_recovery_ops *ops = + clp->cl_mvops->mig_recovery_ops; + struct nfs4_exception exception = { }; + int status; + + dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + nfs_display_fhandle(NFS_FH(inode), __func__); + + do { + status = ops->fsid_present(inode, cred); + if (status != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + return status; +} + /** * If 'use_integrity' is true and the state managment nfs_client * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient @@ -6275,8 +6619,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, struct nfs41_exchange_id_args args = { .verifier = &verifier, .client = clp, +#ifdef CONFIG_NFS_V4_1_MIGRATION + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID | + EXCHGID4_FLAG_SUPP_MOVED_MIGR, +#else .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID, + EXCHGID4_FLAG_BIND_PRINC_STATEID, +#endif }; struct nfs41_exchange_id_res res = { 0 @@ -7055,9 +7405,9 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; struct nfs4_state *state = NULL; - unsigned long timeo, giveup; + unsigned long timeo, now, giveup; - dprintk("--> %s\n", __func__); + dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); if (!nfs41_sequence_done(task, &lgp->res.seq_res)) goto out; @@ -7065,12 +7415,38 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: goto out; + /* + * NFS4ERR_LAYOUTTRYLATER is a conflict with another client + * (or clients) writing to the same RAID stripe + */ case -NFS4ERR_LAYOUTTRYLATER: + /* + * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall + * existing layout before getting a new one). + */ case -NFS4ERR_RECALLCONFLICT: timeo = rpc_get_timeout(task->tk_client); giveup = lgp->args.timestamp + timeo; - if (time_after(giveup, jiffies)) - task->tk_status = -NFS4ERR_DELAY; + now = jiffies; + if (time_after(giveup, now)) { + unsigned long delay; + + /* Delay for: + * - Not less then NFS4_POLL_RETRY_MIN. + * - One last time a jiffie before we give up + * - exponential backoff (time_now minus start_attempt) + */ + delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, + min((giveup - now - 1), + now - lgp->args.timestamp)); + + dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", + __func__, delay); + rpc_delay(task, delay); + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out; /* Do not call nfs4_async_handle_error() */ + } break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: @@ -7244,7 +7620,14 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) return; server = NFS_SERVER(lrp->args.inode); - if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + switch (task->tk_status) { + default: + task->tk_status = 0; + case 0: + break; + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) + break; rpc_restart_call_prepare(task); return; } @@ -7419,10 +7802,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case -NFS4ERR_BADLAYOUT: /* no layout */ case -NFS4ERR_GRACE: /* loca_recalim always false */ task->tk_status = 0; - break; case 0: - nfs_post_op_update_inode_force_wcc(data->args.inode, - data->res.fattr); break; default: if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { @@ -7437,6 +7817,8 @@ static void nfs4_layoutcommit_release(void *calldata) struct nfs4_layoutcommit_data *data = calldata; pnfs_cleanup_layoutcommit(data); + nfs_post_op_update_inode_force_wcc(data->args.inode, + data->res.fattr); put_rpccred(data->cred); kfree(data); } @@ -7559,7 +7941,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, switch (err) { case 0: case -NFS4ERR_WRONGSEC: - case -NFS4ERR_NOTSUPP: + case -ENOTSUPP: goto out; default: err = nfs4_handle_exception(server, err, &exception); @@ -7593,7 +7975,7 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, * Fall back on "guess and check" method if * the server doesn't support SECINFO_NO_NAME */ - if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) { + if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) { err = nfs4_find_root_sec(server, fhandle, info); goto out_freepage; } @@ -7615,6 +7997,9 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, break; } + if (!nfs_auth_info_match(&server->auth_info, flavor)) + flavor = RPC_AUTH_MAXFLAVOR; + if (flavor != RPC_AUTH_MAXFLAVOR) { err = nfs4_lookup_root_sec(server, fhandle, info, flavor); @@ -7886,6 +8271,18 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { }; #endif +static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = { + .get_locations = _nfs40_proc_get_locations, + .fsid_present = _nfs40_proc_fsid_present, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = { + .get_locations = _nfs41_proc_get_locations, + .fsid_present = _nfs41_proc_fsid_present, +}; +#endif /* CONFIG_NFS_V4_1 */ + static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .init_caps = NFS_CAP_READDIRPLUS @@ -7901,6 +8298,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, .state_renewal_ops = &nfs40_state_renewal_ops, + .mig_recovery_ops = &nfs40_mig_recovery_ops, }; #if defined(CONFIG_NFS_V4_1) @@ -7921,6 +8319,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, + .mig_recovery_ops = &nfs41_mig_recovery_ops, }; #endif @@ -8004,7 +8403,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .unlink_setup = nfs4_proc_unlink_setup, .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare, .unlink_done = nfs4_proc_unlink_done, - .rename = nfs4_proc_rename, .rename_setup = nfs4_proc_rename_setup, .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare, .rename_done = nfs4_proc_rename_done, @@ -8019,13 +8417,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .pathconf = nfs4_proc_pathconf, .set_capabilities = nfs4_server_capabilities, .decode_dirent = nfs4_decode_dirent, + .pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare, .read_setup = nfs4_proc_read_setup, - .read_pageio_init = pnfs_pageio_init_read, - .read_rpc_prepare = nfs4_proc_read_rpc_prepare, .read_done = nfs4_read_done, .write_setup = nfs4_proc_write_setup, - .write_pageio_init = pnfs_pageio_init_write, - .write_rpc_prepare = nfs4_proc_write_rpc_prepare, .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, .commit_rpc_prepare = nfs4_proc_commit_rpc_prepare, diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index cf883c7ae05..e799dc3c3b1 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -231,14 +231,23 @@ out: return ret; } +/* + * nfs4_release_slot_table - release all slot table entries + */ +static void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +{ + nfs4_shrink_slot_table(tbl, 0); +} + /** - * nfs4_release_slot_table - release resources attached to a slot table + * nfs4_shutdown_slot_table - release resources attached to a slot table * @tbl: slot table to shut down * */ -void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl) { - nfs4_shrink_slot_table(tbl, 0); + nfs4_release_slot_table(tbl); + rpc_destroy_wait_queue(&tbl->slot_tbl_waitq); } /** @@ -422,7 +431,7 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, spin_unlock(&tbl->slot_tbl_lock); } -static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +static void nfs4_release_session_slot_tables(struct nfs4_session *session) { nfs4_release_slot_table(&session->fc_slot_table); nfs4_release_slot_table(&session->bc_slot_table); @@ -450,7 +459,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses) if (status && tbl->slots == NULL) /* Fore and back channel share a connection so get * both slot tables or neither */ - nfs4_destroy_session_slot_tables(ses); + nfs4_release_session_slot_tables(ses); return status; } @@ -470,6 +479,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) return session; } +static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +{ + nfs4_shutdown_slot_table(&session->fc_slot_table); + nfs4_shutdown_slot_table(&session->bc_slot_table); +} + void nfs4_destroy_session(struct nfs4_session *session) { struct rpc_xprt *xprt; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 23230610065..b34ada9bc6a 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -74,7 +74,7 @@ enum nfs4_session_state { extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs, const char *queue); -extern void nfs4_release_slot_table(struct nfs4_slot_table *tbl); +extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cc14cbb78b7..848f6853c59 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -239,14 +239,12 @@ static void nfs4_end_drain_session(struct nfs_client *clp) } } -#if defined(CONFIG_NFS_V4_1) - static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) { set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); spin_lock(&tbl->slot_tbl_lock); if (tbl->highest_used_slotid != NFS4_NO_SLOT) { - INIT_COMPLETION(tbl->complete); + reinit_completion(&tbl->complete); spin_unlock(&tbl->slot_tbl_lock); return wait_for_completion_interruptible(&tbl->complete); } @@ -270,6 +268,8 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) return nfs4_drain_slot_tbl(&ses->fc_slot_table); } +#if defined(CONFIG_NFS_V4_1) + static int nfs41_setup_state_renewal(struct nfs_client *clp) { int status; @@ -974,9 +974,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { nfs4_stateid_copy(dst, &lsp->ls_stateid); ret = 0; - smp_rmb(); - if (!list_empty(&lsp->ls_seqid.list)) - ret = -EWOULDBLOCK; } spin_unlock(&state->state_lock); nfs4_put_lock_state(lsp); @@ -984,10 +981,9 @@ out: return ret; } -static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { const nfs4_stateid *src; - int ret; int seq; do { @@ -996,12 +992,7 @@ static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) if (test_bit(NFS_OPEN_STATE, &state->flags)) src = &state->open_stateid; nfs4_stateid_copy(dst, src); - ret = 0; - smp_rmb(); - if (!list_empty(&state->owner->so_seqid.list)) - ret = -EWOULDBLOCK; } while (read_seqretry(&state->seqlock, seq)); - return ret; } /* @@ -1015,15 +1006,19 @@ int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, if (ret == -EIO) /* A lost lock - don't even consider delegations */ goto out; - if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) + /* returns true if delegation stateid found and copied */ + if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) { + ret = 0; goto out; + } if (ret != -ENOENT) /* nfs4_copy_delegation_stateid() didn't over-write * dst, so it still has the lock stateid which we now * choose to use. */ goto out; - ret = nfs4_copy_open_stateid(dst, state); + nfs4_copy_open_stateid(dst, state); + ret = 0; out: if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41)) dst->seqid = 0; @@ -1071,7 +1066,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid) /* * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error() */ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) { @@ -1116,7 +1111,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) /* * Increment the seqid if the LOCK/LOCKU succeeded, or * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error() */ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) { @@ -1145,9 +1140,9 @@ static int nfs4_run_state_manager(void *); static void nfs4_clear_state_manager_bit(struct nfs_client *clp) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING); rpc_wake_up(&clp->cl_rpcwaitq); } @@ -1197,20 +1192,74 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); +/** + * nfs4_schedule_migration_recovery - trigger migration recovery + * + * @server: FSID that is migrating + * + * Returns zero if recovery has started, otherwise a negative NFS4ERR + * value is returned. + */ +int nfs4_schedule_migration_recovery(const struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + if (server->fh_expire_type != NFS4_FH_PERSISTENT) { + pr_err("NFS: volatile file handles not supported (server %s)\n", + clp->cl_hostname); + return -NFS4ERR_IO; + } + + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + return -NFS4ERR_IO; + + dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n", + __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + + set_bit(NFS_MIG_IN_TRANSITION, + &((struct nfs_server *)server)->mig_status); + set_bit(NFS4CLNT_MOVED, &clp->cl_state); + + nfs4_schedule_state_manager(clp); + return 0; +} +EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery); + +/** + * nfs4_schedule_lease_moved_recovery - start lease-moved recovery + * + * @clp: server to check for moved leases + * + */ +void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp) +{ + dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n", + __func__, clp->cl_clientid, clp->cl_hostname); + + set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery); + int nfs4_wait_clnt_recover(struct nfs_client *clp) { int res; might_sleep(); + atomic_inc(&clp->cl_count); res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, nfs_wait_bit_killable, TASK_KILLABLE); if (res) - return res; - + goto out; if (clp->cl_cons_state < 0) - return clp->cl_cons_state; - return 0; + res = clp->cl_cons_state; +out: + nfs_put_client(clp); + return res; } int nfs4_client_recover_expired_lease(struct nfs_client *clp) @@ -1267,7 +1316,7 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st return 1; } -static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -1375,8 +1424,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: goto out; default: - printk(KERN_ERR "NFS: %s: unhandled error %d. " - "Zeroing state\n", __func__, status); + printk(KERN_ERR "NFS: %s: unhandled error %d\n", + __func__, status); case -ENOMEM: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: @@ -1407,7 +1456,7 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs * server that doesn't support a grace period. */ spin_lock(&sp->so_lock); - write_seqcount_begin(&sp->so_reclaim_seqcount); + raw_write_seqcount_begin(&sp->so_reclaim_seqcount); restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) @@ -1422,7 +1471,7 @@ restart: if (status >= 0) { status = nfs4_reclaim_locks(state, ops); if (status >= 0) { - if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) { + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) @@ -1439,15 +1488,12 @@ restart: } switch (status) { default: - printk(KERN_ERR "NFS: %s: unhandled error %d. " - "Zeroing state\n", __func__, status); + printk(KERN_ERR "NFS: %s: unhandled error %d\n", + __func__, status); case -ENOENT: case -ENOMEM: case -ESTALE: - /* - * Open state on this file cannot be recovered - * All we can do is revert to using the zero stateid. - */ + /* Open state on this file cannot be recovered */ nfs4_state_mark_recovery_failed(state, status); break; case -EAGAIN: @@ -1473,13 +1519,13 @@ restart: spin_lock(&sp->so_lock); goto restart; } - write_seqcount_end(&sp->so_reclaim_seqcount); + raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return 0; out_err: nfs4_put_open_state(state); spin_lock(&sp->so_lock); - write_seqcount_end(&sp->so_reclaim_seqcount); + raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return status; } @@ -1628,7 +1674,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) nfs4_state_end_reclaim_reboot(clp); break; case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_LEASE_MOVED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); @@ -1829,6 +1874,168 @@ static int nfs4_purge_lease(struct nfs_client *clp) return 0; } +/* + * Try remote migration of one FSID from a source server to a + * destination server. The source server provides a list of + * potential destinations. + * + * Returns zero or a negative NFS4ERR status code. + */ +static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_fs_locations *locations = NULL; + struct inode *inode; + struct page *page; + int status, result; + + dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + + result = 0; + page = alloc_page(GFP_KERNEL); + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (page == NULL || locations == NULL) { + dprintk("<-- %s: no memory\n", __func__); + goto out; + } + + inode = server->super->s_root->d_inode; + result = nfs4_proc_get_locations(inode, locations, page, cred); + if (result) { + dprintk("<-- %s: failed to retrieve fs_locations: %d\n", + __func__, result); + goto out; + } + + result = -NFS4ERR_NXIO; + if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { + dprintk("<-- %s: No fs_locations data, migration skipped\n", + __func__); + goto out; + } + + nfs4_begin_drain_session(clp); + + status = nfs4_replace_transport(server, locations); + if (status != 0) { + dprintk("<-- %s: failed to replace transport: %d\n", + __func__, status); + goto out; + } + + result = 0; + dprintk("<-- %s: migration succeeded\n", __func__); + +out: + if (page != NULL) + __free_page(page); + kfree(locations); + if (result) { + pr_err("NFS: migration recovery failed (server %s)\n", + clp->cl_hostname); + set_bit(NFS_MIG_FAILED, &server->mig_status); + } + return result; +} + +/* + * Returns zero or a negative NFS4ERR status code. + */ +static int nfs4_handle_migration(struct nfs_client *clp) +{ + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + struct nfs_server *server; + struct rpc_cred *cred; + + dprintk("%s: migration reported on \"%s\"\n", __func__, + clp->cl_hostname); + + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) + return -NFS4ERR_NOENT; + + clp->cl_mig_gen++; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + int status; + + if (server->mig_gen == clp->cl_mig_gen) + continue; + server->mig_gen = clp->cl_mig_gen; + + if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION, + &server->mig_status)) + continue; + + rcu_read_unlock(); + status = nfs4_try_migration(server, cred); + if (status < 0) { + put_rpccred(cred); + return status; + } + goto restart; + } + rcu_read_unlock(); + put_rpccred(cred); + return 0; +} + +/* + * Test each nfs_server on the clp's cl_superblocks list to see + * if it's moved to another server. Stop when the server no longer + * returns NFS4ERR_LEASE_MOVED. + */ +static int nfs4_handle_lease_moved(struct nfs_client *clp) +{ + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + struct nfs_server *server; + struct rpc_cred *cred; + + dprintk("%s: lease moved reported on \"%s\"\n", __func__, + clp->cl_hostname); + + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) + return -NFS4ERR_NOENT; + + clp->cl_mig_gen++; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + struct inode *inode; + int status; + + if (server->mig_gen == clp->cl_mig_gen) + continue; + server->mig_gen = clp->cl_mig_gen; + + rcu_read_unlock(); + + inode = server->super->s_root->d_inode; + status = nfs4_proc_fsid_present(inode, cred); + if (status != -NFS4ERR_MOVED) + goto restart; /* wasn't this one */ + if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED) + goto restart; /* there are more */ + goto out; + } + rcu_read_unlock(); + +out: + put_rpccred(cred); + return 0; +} + /** * nfs4_discover_server_trunking - Detect server IP address trunking * @@ -1868,8 +2075,10 @@ again: switch (status) { case 0: break; - case -NFS4ERR_DELAY: case -ETIMEDOUT: + if (clnt->cl_softrtry) + break; + case -NFS4ERR_DELAY: case -EAGAIN: ssleep(1); case -NFS4ERR_STALE_CLIENTID: @@ -1881,10 +2090,15 @@ again: nfs4_root_machine_cred(clp); goto again; } - if (i > 2) + if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) break; case -NFS4ERR_CLID_INUSE: case -NFS4ERR_WRONGSEC: + /* No point in retrying if we already used RPC_AUTH_UNIX */ + if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) { + status = -EPERM; + break; + } clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX); if (IS_ERR(clnt)) { status = PTR_ERR(clnt); @@ -2017,9 +2231,10 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) nfs41_handle_server_reboot(clp); if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | - SEQ4_STATUS_ADMIN_STATE_REVOKED | - SEQ4_STATUS_LEASE_MOVED)) + SEQ4_STATUS_ADMIN_STATE_REVOKED)) nfs41_handle_state_revoked(clp); + if (flags & SEQ4_STATUS_LEASE_MOVED) + nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) @@ -2157,7 +2372,20 @@ static void nfs4_state_manager(struct nfs_client *clp) status = nfs4_check_lease(clp); if (status < 0) goto out_error; - continue; + } + + if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { + section = "migration"; + status = nfs4_handle_migration(clp); + if (status < 0) + goto out_error; + } + + if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) { + section = "lease moved"; + status = nfs4_handle_lease_moved(clp); + if (status < 0) + goto out_error; } /* First recover reboot state... */ diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index e26acdd1a64..6f340f02f2b 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -77,17 +77,9 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret = nfs_write_inode(inode, wbc); - if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { - int status; - bool sync = true; - - if (wbc->sync_mode == WB_SYNC_NONE) - sync = false; - - status = pnfs_layoutcommit_inode(inode, sync); - if (status < 0) - return status; - } + if (ret == 0) + ret = pnfs_layoutcommit_inode(inode, + wbc->sync_mode == WB_SYNC_ALL); return ret; } @@ -98,7 +90,7 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc) */ static void nfs4_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); pnfs_return_layout(inode); pnfs_destroy_layout(NFS_I(inode)); @@ -261,9 +253,9 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name, res = nfs_follow_remote_path(root_mnt, export_path); - dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n", - IS_ERR(res) ? PTR_ERR(res) : 0, - IS_ERR(res) ? " [error]" : ""); + dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", + PTR_ERR_OR_ZERO(res), + IS_ERR(res) ? " [error]" : ""); return res; } @@ -319,9 +311,9 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, data->mnt_path = export_path; res = nfs_follow_remote_path(root_mnt, export_path); - dprintk("<-- nfs4_referral_mount() = %ld%s\n", - IS_ERR(res) ? PTR_ERR(res) : 0, - IS_ERR(res) ? " [error]" : ""); + dprintk("<-- nfs4_referral_mount() = %d%s\n", + PTR_ERR_OR_ZERO(res), + IS_ERR(res) ? " [error]" : ""); return res; } diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 2628d921b7e..b6ebe7e445f 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -16,7 +16,7 @@ static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; -static ctl_table nfs4_cb_sysctls[] = { +static struct ctl_table nfs4_cb_sysctls[] = { { .procname = "nfs_callback_tcpport", .data = &nfs_callback_set_tcpport, @@ -36,7 +36,7 @@ static ctl_table nfs4_cb_sysctls[] = { { } }; -static ctl_table nfs4_cb_sysctl_dir[] = { +static struct ctl_table nfs4_cb_sysctl_dir[] = { { .procname = "nfs", .mode = 0555, @@ -45,7 +45,7 @@ static ctl_table nfs4_cb_sysctl_dir[] = { { } }; -static ctl_table nfs4_cb_sysctl_root[] = { +static struct ctl_table nfs4_cb_sysctl_root[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 849cf146db3..0a744f3a86f 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -932,7 +932,7 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group); DECLARE_EVENT_CLASS(nfs4_read_event, TP_PROTO( - const struct nfs_read_data *data, + const struct nfs_pgio_data *data, int error ), @@ -972,7 +972,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event, #define DEFINE_NFS4_READ_EVENT(name) \ DEFINE_EVENT(nfs4_read_event, name, \ TP_PROTO( \ - const struct nfs_read_data *data, \ + const struct nfs_pgio_data *data, \ int error \ ), \ TP_ARGS(data, error)) @@ -983,7 +983,7 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); DECLARE_EVENT_CLASS(nfs4_write_event, TP_PROTO( - const struct nfs_write_data *data, + const struct nfs_pgio_data *data, int error ), @@ -1024,7 +1024,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event, #define DEFINE_NFS4_WRITE_EVENT(name) \ DEFINE_EVENT(nfs4_write_event, name, \ TP_PROTO( \ - const struct nfs_write_data *data, \ + const struct nfs_pgio_data *data, \ int error \ ), \ TP_ARGS(data, error)) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 79210d23f60..939ae606cfa 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -105,12 +105,8 @@ static int nfs4_stat_to_errno(int); #ifdef CONFIG_NFS_V4_SECURITY_LABEL /* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */ #define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN)) -#define encode_readdir_space 24 -#define encode_readdir_bitmask_sz 3 #else #define nfs4_label_maxsz 0 -#define encode_readdir_space 20 -#define encode_readdir_bitmask_sz 2 #endif /* We support only one layout type per file system */ #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) @@ -207,8 +203,7 @@ static int nfs4_stat_to_errno(int); 2 + encode_verifier_maxsz + 5 + \ nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ - decode_verifier_maxsz + \ - nfs4_label_maxsz + nfs4_fattr_maxsz) + decode_verifier_maxsz) #define encode_readlink_maxsz (op_encode_hdr_maxsz) #define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) #define encode_write_maxsz (op_encode_hdr_maxsz + \ @@ -595,11 +590,13 @@ static int nfs4_stat_to_errno(int); #define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_getattr_maxsz) + encode_getattr_maxsz + \ + encode_renew_maxsz) #define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_getattr_maxsz) + decode_getattr_maxsz + \ + decode_renew_maxsz) #define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -736,13 +733,15 @@ static int nfs4_stat_to_errno(int); encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_lookup_maxsz + \ - encode_fs_locations_maxsz) + encode_fs_locations_maxsz + \ + encode_renew_maxsz) #define NFS4_dec_fs_locations_sz \ (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_lookup_maxsz + \ - decode_fs_locations_maxsz) + decode_fs_locations_maxsz + \ + decode_renew_maxsz) #define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -751,6 +750,18 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_secinfo_maxsz) +#define NFS4_enc_fsid_present_sz \ + (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getfh_maxsz + \ + encode_renew_maxsz) +#define NFS4_dec_fsid_present_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getfh_maxsz + \ + decode_renew_maxsz) #if defined(CONFIG_NFS_V4_1) #define NFS4_enc_bind_conn_to_session_sz \ (compound_encode_hdr_maxsz + \ @@ -1545,7 +1556,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr); } -static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) { __be32 *p; @@ -1565,6 +1577,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg }; uint32_t dircount = readdir->count >> 1; __be32 *p, verf[2]; + uint32_t attrlen = 0; + unsigned int i; if (readdir->plus) { attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| @@ -1573,26 +1587,27 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; dircount >>= 1; } /* Use mounted_on_fileid only if the server supports it */ if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) attrs[0] |= FATTR4_WORD0_FILEID; + for (i = 0; i < ARRAY_SIZE(attrs); i++) { + attrs[i] &= readdir->bitmask[i]; + if (attrs[i] != 0) + attrlen = i+1; + } encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); encode_uint64(xdr, readdir->cookie); encode_nfs4_verifier(xdr, &readdir->verifier); - p = reserve_space(xdr, encode_readdir_space); + p = reserve_space(xdr, 12 + (attrlen << 2)); *p++ = cpu_to_be32(dircount); *p++ = cpu_to_be32(readdir->count); - *p++ = cpu_to_be32(encode_readdir_bitmask_sz); - *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); - *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); - if (encode_readdir_bitmask_sz > 2) { - if (hdr->minorversion > 1) - attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; - p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]); - } + *p++ = cpu_to_be32(attrlen); + for (i = 0; i < attrlen; i++) + *p++ = cpu_to_be32(attrs[i]); memcpy(verf, readdir->verifier.data, sizeof(verf)); dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n", @@ -1687,7 +1702,8 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4 encode_nfs4_verifier(xdr, &arg->confirm); } -static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) { __be32 *p; @@ -2437,7 +2453,7 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, * Encode a READ request */ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readargs *args) + struct nfs_pgio_args *args) { struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), @@ -2499,7 +2515,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, * Encode a WRITE request */ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeargs *args) + struct nfs_pgio_args *args) { struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), @@ -2687,11 +2703,20 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); - encode_putfh(xdr, args->dir_fh, &hdr); - encode_lookup(xdr, args->name, &hdr); - replen = hdr.replen; /* get the attribute into args->page */ - encode_fs_locations(xdr, args->bitmask, &hdr); + if (args->migration) { + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen; + encode_fs_locations(xdr, args->bitmask, &hdr); + if (args->renew) + encode_renew(xdr, args->clientid, &hdr); + } else { + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + replen = hdr.replen; + encode_fs_locations(xdr, args->bitmask, &hdr); + } + /* Set up reply kvec to capture returned fs_locations array. */ xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, 0, PAGE_SIZE); encode_nops(&hdr); @@ -2715,6 +2740,26 @@ static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode FSID_PRESENT request + */ +static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fsid_present_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfh(xdr, &hdr); + if (args->renew) + encode_renew(xdr, args->clientid, &hdr); + encode_nops(&hdr); +} + #if defined(CONFIG_NFS_V4_1) /* * BIND_CONN_TO_SESSION request @@ -3053,7 +3098,8 @@ out_overflow: return -EIO; } -static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, + int *nfs_retval) { __be32 *p; uint32_t opnum; @@ -3063,19 +3109,32 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) if (unlikely(!p)) goto out_overflow; opnum = be32_to_cpup(p++); - if (opnum != expected) { - dprintk("nfs: Server returned operation" - " %d but we issued a request for %d\n", - opnum, expected); - return -EIO; - } + if (unlikely(opnum != expected)) + goto out_bad_operation; nfserr = be32_to_cpup(p); - if (nfserr != NFS_OK) - return nfs4_stat_to_errno(nfserr); - return 0; + if (nfserr == NFS_OK) + *nfs_retval = 0; + else + *nfs_retval = nfs4_stat_to_errno(nfserr); + return true; +out_bad_operation: + dprintk("nfs: Server returned operation" + " %d but we issued a request for %d\n", + opnum, expected); + *nfs_retval = -EREMOTEIO; + return false; out_overflow: print_overflow_msg(__func__, xdr); - return -EIO; + *nfs_retval = -EIO; + return false; +} + +static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +{ + int retval; + + __decode_op_hdr(xdr, expected, &retval); + return retval; } /* Dummy routine */ @@ -3391,7 +3450,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint { __be32 *p; - *res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL; + *res = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { @@ -4957,11 +5016,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) uint32_t savewords, bmlen, i; int status; - status = decode_op_hdr(xdr, OP_OPEN); - if (status != -EIO) - nfs_increment_open_seqid(status, res->seqid); - if (!status) - status = decode_stateid(xdr, &res->stateid); + if (!__decode_op_hdr(xdr, OP_OPEN, &status)) + return status; + nfs_increment_open_seqid(status, res->seqid); + if (status) + return status; + status = decode_stateid(xdr, &res->stateid); if (unlikely(status)) return status; @@ -5027,7 +5087,8 @@ static int decode_putrootfh(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_PUTROOTFH); } -static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res) +static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs_pgio_res *res) { __be32 *p; uint32_t count, eof, recvd; @@ -5281,7 +5342,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM); } -static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) +static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res) { __be32 *p; int status; @@ -6578,7 +6639,7 @@ out: * Decode Read response */ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, - struct nfs_readres *res) + struct nfs_pgio_res *res) { struct compound_hdr hdr; int status; @@ -6603,7 +6664,7 @@ out: * Decode WRITE response */ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, - struct nfs_writeres *res) + struct nfs_pgio_res *res) { struct compound_hdr hdr; int status; @@ -6824,13 +6885,26 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, status = decode_putfh(xdr); if (status) goto out; - status = decode_lookup(xdr); - if (status) - goto out; - xdr_enter_page(xdr, PAGE_SIZE); - status = decode_getfattr_generic(xdr, &res->fs_locations->fattr, + if (res->migration) { + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr_generic(xdr, + &res->fs_locations->fattr, NULL, res->fs_locations, NULL, res->fs_locations->server); + if (status) + goto out; + if (res->renew) + status = decode_renew(xdr); + } else { + status = decode_lookup(xdr); + if (status) + goto out; + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr_generic(xdr, + &res->fs_locations->fattr, + NULL, res->fs_locations, + NULL, res->fs_locations->server); + } out: return status; } @@ -6859,6 +6933,34 @@ out: return status; } +/* + * Decode FSID_PRESENT response + */ +static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_fsid_present_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status) + goto out; + if (res->renew) + status = decode_renew(xdr); +out: + return status; +} + #if defined(CONFIG_NFS_V4_1) /* * Decode BIND_CONN_TO_SESSION response @@ -7373,6 +7475,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), PROC(SECINFO, enc_secinfo, dec_secinfo), + PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), PROC(CREATE_SESSION, enc_create_session, dec_create_session), diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 89fe741e58b..59f838cdc00 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -36,6 +36,7 @@ __print_flags(v, "|", \ { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ { 1 << NFS_INO_STALE, "STALE" }, \ + { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ { 1 << NFS_INO_COMMIT, "COMMIT" }, \ diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 5457745dd4f..611320753db 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -439,7 +439,7 @@ static void _read_done(struct ore_io_state *ios, void *private) objlayout_read_done(&objios->oir, status, objios->sync); } -int objio_read_pagelist(struct nfs_read_data *rdata) +int objio_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *hdr = rdata->header; struct objio_state *objios; @@ -487,7 +487,7 @@ static void _write_done(struct ore_io_state *ios, void *private) static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) { struct objio_state *objios = priv; - struct nfs_write_data *wdata = objios->oir.rpcdata; + struct nfs_pgio_data *wdata = objios->oir.rpcdata; struct address_space *mapping = wdata->header->inode->i_mapping; pgoff_t index = offset / PAGE_SIZE; struct page *page; @@ -531,7 +531,7 @@ static const struct _ore_r4w_op _r4w_op = { .put_page = &__r4w_put_page, }; -int objio_write_pagelist(struct nfs_write_data *wdata, int how) +int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) { struct nfs_pgio_header *hdr = wdata->header; struct objio_state *objios; @@ -564,14 +564,22 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how) return 0; } -static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (!pnfs_generic_pg_test(pgio, prev, req)) - return false; + unsigned int size; + + size = pnfs_generic_pg_test(pgio, prev, req); + + if (!size || pgio->pg_count + req->wb_bytes > + (unsigned long)pgio->pg_layout_private) + return 0; - return pgio->pg_count + req->wb_bytes <= - (unsigned long)pgio->pg_layout_private; + return min(size, req->wb_bytes); } static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index e4f9cbfec67..765d3f54e98 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -53,10 +53,10 @@ objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) struct objlayout *objlay; objlay = kzalloc(sizeof(struct objlayout), gfp_flags); - if (objlay) { - spin_lock_init(&objlay->lock); - INIT_LIST_HEAD(&objlay->err_list); - } + if (!objlay) + return NULL; + spin_lock_init(&objlay->lock); + INIT_LIST_HEAD(&objlay->err_list); dprintk("%s: Return %p\n", __func__, objlay); return &objlay->pnfs_layout; } @@ -229,11 +229,11 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, static void _rpc_read_complete(struct work_struct *work) { struct rpc_task *task; - struct nfs_read_data *rdata; + struct nfs_pgio_data *rdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - rdata = container_of(task, struct nfs_read_data, task); + rdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_read_done(rdata); } @@ -241,7 +241,7 @@ static void _rpc_read_complete(struct work_struct *work) void objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_read_data *rdata = oir->rpcdata; + struct nfs_pgio_data *rdata = oir->rpcdata; oir->status = rdata->task.tk_status = status; if (status >= 0) @@ -266,7 +266,7 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) * Perform sync or async reads. */ enum pnfs_try_status -objlayout_read_pagelist(struct nfs_read_data *rdata) +objlayout_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *hdr = rdata->header; struct inode *inode = hdr->inode; @@ -312,11 +312,11 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) static void _rpc_write_complete(struct work_struct *work) { struct rpc_task *task; - struct nfs_write_data *wdata; + struct nfs_pgio_data *wdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - wdata = container_of(task, struct nfs_write_data, task); + wdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_write_done(wdata); } @@ -324,7 +324,7 @@ static void _rpc_write_complete(struct work_struct *work) void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_write_data *wdata = oir->rpcdata; + struct nfs_pgio_data *wdata = oir->rpcdata; oir->status = wdata->task.tk_status = status; if (status >= 0) { @@ -351,7 +351,7 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) * Perform sync or async writes. */ enum pnfs_try_status -objlayout_write_pagelist(struct nfs_write_data *wdata, +objlayout_write_pagelist(struct nfs_pgio_data *wdata, int how) { struct nfs_pgio_header *hdr = wdata->header; diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 87aa1dec612..01e041029a6 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg); */ extern void objio_free_result(struct objlayout_io_res *oir); -extern int objio_read_pagelist(struct nfs_read_data *rdata); -extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); +extern int objio_read_pagelist(struct nfs_pgio_data *rdata); +extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how); /* * callback API @@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg( extern void objlayout_free_lseg(struct pnfs_layout_segment *); extern enum pnfs_try_status objlayout_read_pagelist( - struct nfs_read_data *); + struct nfs_pgio_data *); extern enum pnfs_try_status objlayout_write_pagelist( - struct nfs_write_data *, + struct nfs_pgio_data *, int how); extern void objlayout_encode_layoutcommit( diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 2ffebf2081c..17fab89f635 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -24,9 +24,12 @@ #include "internal.h" #include "pnfs.h" +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + static struct kmem_cache *nfs_page_cachep; +static const struct rpc_call_ops nfs_pgio_common_ops; -bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) +static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) { p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) @@ -95,7 +98,7 @@ nfs_iocounter_dec(struct nfs_io_counter *c) { if (atomic_dec_and_test(&c->io_count)) { clear_bit(NFS_IO_INPROGRESS, &c->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&c->flags, NFS_IO_INPROGRESS); } } @@ -133,11 +136,168 @@ nfs_iocounter_wait(struct nfs_io_counter *c) return __nfs_iocounter_wait(c); } +static int nfs_wait_bit_uninterruptible(void *word) +{ + io_schedule(); + return 0; +} + +/* + * nfs_page_group_lock - lock the head of the page group + * @req - request in group that is to be locked + * + * this lock must be held if modifying the page group list + */ +void +nfs_page_group_lock(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + nfs_wait_bit_uninterruptible, + TASK_UNINTERRUPTIBLE); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req - request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + smp_mb__before_atomic(); + clear_bit(PG_HEADLOCK, &head->wb_flags); + smp_mb__after_atomic(); + wake_up_bit(&head->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_sync_on_bit_locked + * + * must be called with page group lock held + */ +static bool +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +{ + struct nfs_page *head = req->wb_head; + struct nfs_page *tmp; + + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags)); + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags)); + + tmp = req->wb_this_page; + while (tmp != req) { + if (!test_bit(bit, &tmp->wb_flags)) + return false; + tmp = tmp->wb_this_page; + } + + /* true! reset all bits */ + tmp = req; + do { + clear_bit(bit, &tmp->wb_flags); + tmp = tmp->wb_this_page; + } while (tmp != req); + + return true; +} + +/* + * nfs_page_group_sync_on_bit - set bit on current request, but only + * return true if the bit is set for all requests in page group + * @req - request in page group + * @bit - PG_* bit that is used to sync page group + */ +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) +{ + bool ret; + + nfs_page_group_lock(req); + ret = nfs_page_group_sync_on_bit_locked(req, bit); + nfs_page_group_unlock(req); + + return ret; +} + +/* + * nfs_page_group_init - Initialize the page group linkage for @req + * @req - a new nfs request + * @prev - the previous request in page group, or NULL if @req is the first + * or only request in the group (the head). + */ +static inline void +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) +{ + WARN_ON_ONCE(prev == req); + + if (!prev) { + /* a head request */ + req->wb_head = req; + req->wb_this_page = req; + } else { + /* a subrequest */ + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); + req->wb_head = prev->wb_head; + req->wb_this_page = prev->wb_this_page; + prev->wb_this_page = req; + + /* All subrequests take a ref on the head request until + * nfs_page_group_destroy is called */ + kref_get(&req->wb_head->wb_kref); + + /* grab extra ref if head request has extra ref from + * the write/commit path to handle handoff between write + * and commit lists */ + if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { + set_bit(PG_INODE_REF, &req->wb_flags); + kref_get(&req->wb_kref); + } + } +} + +/* + * nfs_page_group_destroy - sync the destruction of page groups + * @req - request that no longer needs the page group + * + * releases the page group reference from each member once all + * members have called this function. + */ +static void +nfs_page_group_destroy(struct kref *kref) +{ + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + struct nfs_page *tmp, *next; + + /* subrequests must release the ref on the head request */ + if (req->wb_head != req) + nfs_release_request(req->wb_head); + + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) + return; + + tmp = req; + do { + next = tmp->wb_this_page; + /* unlink and free */ + tmp->wb_this_page = tmp; + tmp->wb_head = tmp; + nfs_free_request(tmp); + tmp = next; + } while (tmp != req); +} + /** * nfs_create_request - Create an NFS read/write request. * @ctx: open context to use - * @inode: inode to which the request is attached * @page: page to write + * @last: last nfs request created for this page group or NULL if head * @offset: starting offset within the page for the write * @count: number of bytes to read/write * @@ -146,9 +306,9 @@ nfs_iocounter_wait(struct nfs_io_counter *c) * User should ensure it is safe to sleep in this function. */ struct nfs_page * -nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, - struct page *page, - unsigned int offset, unsigned int count) +nfs_create_request(struct nfs_open_context *ctx, struct page *page, + struct nfs_page *last, unsigned int offset, + unsigned int count) { struct nfs_page *req; struct nfs_lock_context *l_ctx; @@ -180,6 +340,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); kref_init(&req->wb_kref); + nfs_page_group_init(req, last); return req; } @@ -193,9 +354,9 @@ void nfs_unlock_request(struct nfs_page *req) printk(KERN_ERR "NFS: Invalid unlock attempted\n"); BUG(); } - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(PG_BUSY, &req->wb_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&req->wb_flags, PG_BUSY); } @@ -237,16 +398,22 @@ static void nfs_clear_request(struct nfs_page *req) } } - /** * nfs_release_request - Release the count on an NFS read/write request * @req: request to release * * Note: Should never be called with the spinlock held! */ -static void nfs_free_request(struct kref *kref) +void nfs_free_request(struct nfs_page *req) { - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + WARN_ON_ONCE(req->wb_this_page != req); + + /* extra debug: make sure no sync bits are still set */ + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags)); /* Release struct file and open context */ nfs_clear_request(req); @@ -255,13 +422,7 @@ static void nfs_free_request(struct kref *kref) void nfs_release_request(struct nfs_page *req) { - kref_put(&req->wb_kref, nfs_free_request); -} - -static int nfs_wait_bit_uninterruptible(void *word) -{ - io_schedule(); - return 0; + kref_put(&req->wb_kref, nfs_page_group_destroy); } /** @@ -279,22 +440,249 @@ nfs_wait_on_request(struct nfs_page *req) TASK_UNINTERRUPTIBLE); } -bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) +/* + * nfs_generic_pg_test - determine if requests can be coalesced + * @desc: pointer to descriptor + * @prev: previous request in desc, or NULL + * @req: this request + * + * Returns zero if @req can be coalesced into @desc, otherwise it returns + * the size of the request. + */ +size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, + struct nfs_page *prev, struct nfs_page *req) { - /* - * FIXME: ideally we should be able to coalesce all requests - * that are not block boundary aligned, but currently this - * is problematic for the case of bsize < PAGE_CACHE_SIZE, - * since nfs_flush_multi and nfs_pagein_multi assume you - * can have only one struct nfs_page. - */ - if (desc->pg_bsize < PAGE_SIZE) + if (desc->pg_count > desc->pg_bsize) { + /* should never happen */ + WARN_ON_ONCE(1); return 0; + } - return desc->pg_count + req->wb_bytes <= desc->pg_bsize; + return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); } EXPORT_SYMBOL_GPL(nfs_generic_pg_test); +static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) +{ + return container_of(hdr, struct nfs_rw_header, header); +} + +/** + * nfs_rw_header_alloc - Allocate a header for a read or write + * @ops: Read or write function vector + */ +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops) +{ + struct nfs_rw_header *header = ops->rw_alloc_header(); + + if (header) { + struct nfs_pgio_header *hdr = &header->header; + + INIT_LIST_HEAD(&hdr->pages); + spin_lock_init(&hdr->lock); + atomic_set(&hdr->refcnt, 0); + hdr->rw_ops = ops; + } + return header; +} +EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); + +/* + * nfs_rw_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_rw_header_free(struct nfs_pgio_header *hdr) +{ + hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr)); +} +EXPORT_SYMBOL_GPL(nfs_rw_header_free); + +/** + * nfs_pgio_data_alloc - Allocate pageio data + * @hdr: The header making a request + * @pagecount: Number of pages to create + */ +static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr, + unsigned int pagecount) +{ + struct nfs_pgio_data *data, *prealloc; + + prealloc = &NFS_RW_HEADER(hdr)->rpc_data; + if (prealloc->header == NULL) + data = prealloc; + else + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + if (nfs_pgarray_set(&data->pages, pagecount)) { + data->header = hdr; + atomic_inc(&hdr->refcnt); + } else { + if (data != prealloc) + kfree(data); + data = NULL; + } +out: + return data; +} + +/** + * nfs_pgio_data_release - Properly free pageio data + * @data: The data to release + */ +void nfs_pgio_data_release(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr); + + put_nfs_open_context(data->args.context); + if (data->pages.pagevec != data->pages.page_array) + kfree(data->pages.pagevec); + if (data == &pageio_header->rpc_data) { + data->header = NULL; + data = NULL; + } + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); + /* Note: we only free the rpc_task after callbacks are done. + * See the comment in rpc_free_task() for why + */ + kfree(data); +} +EXPORT_SYMBOL_GPL(nfs_pgio_data_release); + +/** + * nfs_pgio_rpcsetup - Set up arguments for a pageio call + * @data: The pageio data + * @count: Number of bytes to read + * @offset: Initial offset + * @how: How to commit data (writes only) + * @cinfo: Commit information for the call (writes only) + */ +static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, + unsigned int count, unsigned int offset, + int how, struct nfs_commit_info *cinfo) +{ + struct nfs_page *req = data->header->req; + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + data->args.fh = NFS_FH(data->header->inode); + data->args.offset = req_offset(req) + offset; + /* pnfs_set_layoutcommit needs this */ + data->mds_offset = data->args.offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.pages = data->pages.pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { + case 0: + break; + case FLUSH_COND_STABLE: + if (nfs_reqs_to_commit(cinfo)) + break; + default: + data->args.stable = NFS_FILE_SYNC; + } + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); +} + +/** + * nfs_pgio_prepare - Prepare pageio data to go over the wire + * @task: The current task + * @calldata: pageio data to prepare + */ +static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_data *data = calldata; + int err; + err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); + if (err) + rpc_exit(task, err); +} + +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data, + const struct rpc_call_ops *call_ops, int how, int flags) +{ + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->header->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | flags, + }; + int ret = 0; + + data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how); + + dprintk("NFS: %5u initiated pgio call " + "(req %s/%llu, %u bytes @ offset %llu)\n", + data->task.tk_pid, + data->header->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(data->header->inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_initiate_pgio); + +/** + * nfs_pgio_error - Clean up from a pageio error + * @desc: IO descriptor + * @hdr: pageio header + */ +static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + set_bit(NFS_IOHDR_REDO, &hdr->flags); + nfs_pgio_data_release(hdr->data); + hdr->data = NULL; + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + return -ENOMEM; +} + +/** + * nfs_pgio_release - Release pageio data + * @calldata: The pageio data to release + */ +static void nfs_pgio_release(void *calldata) +{ + struct nfs_pgio_data *data = calldata; + if (data->header->rw_ops->rw_release) + data->header->rw_ops->rw_release(data); + nfs_pgio_data_release(data); +} + /** * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor @@ -307,6 +695,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, const struct nfs_pageio_ops *pg_ops, const struct nfs_pgio_completion_ops *compl_ops, + const struct nfs_rw_ops *rw_ops, size_t bsize, int io_flags) { @@ -320,6 +709,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_inode = inode; desc->pg_ops = pg_ops; desc->pg_completion_ops = compl_ops; + desc->pg_rw_ops = rw_ops; desc->pg_ioflags = io_flags; desc->pg_error = 0; desc->pg_lseg = NULL; @@ -328,6 +718,94 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, } EXPORT_SYMBOL_GPL(nfs_pageio_init); +/** + * nfs_pgio_result - Basic pageio error handling + * @task: The task that ran + * @calldata: Pageio data to check + */ +static void nfs_pgio_result(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_data *data = calldata; + struct inode *inode = data->header->inode; + + dprintk("NFS: %s: %5u, (status %d)\n", __func__, + task->tk_pid, task->tk_status); + + if (data->header->rw_ops->rw_done(task, data, inode) != 0) + return; + if (task->tk_status < 0) + nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); + else + data->header->rw_ops->rw_result(task, data); +} + +/* + * Create an RPC task for the given read or write request and kick it. + * The page must have been locked by the caller. + * + * It may happen that the page we're passed is not marked dirty. + * This is the case if nfs_updatepage detects a conflicting request + * that has been written but not committed. + */ +int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + struct nfs_page *req; + struct page **pages; + struct nfs_pgio_data *data; + struct list_head *head = &desc->pg_list; + struct nfs_commit_info cinfo; + + data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) + return nfs_pgio_error(desc, hdr); + + nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); + pages = data->pages.pagevec; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &hdr->pages); + *pages++ = req->wb_page; + } + + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + /* Set up the argument struct */ + nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); + hdr->data = data; + desc->pg_rpc_callops = &nfs_pgio_common_ops; + return 0; +} +EXPORT_SYMBOL_GPL(nfs_generic_pgio); + +static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) +{ + struct nfs_rw_header *rw_hdr; + struct nfs_pgio_header *hdr; + int ret; + + rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops); + if (!rw_hdr) { + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + return -ENOMEM; + } + hdr = &rw_hdr->header; + nfs_pgheader_init(desc, hdr, nfs_rw_header_free); + atomic_inc(&hdr->refcnt); + ret = nfs_generic_pgio(desc, hdr); + if (ret == 0) + ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), + hdr->data, desc->pg_rpc_callops, + desc->pg_ioflags, 0); + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); + return ret; +} + static bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { @@ -356,18 +834,23 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, struct nfs_page *req, struct nfs_pageio_descriptor *pgio) { - if (!nfs_match_open_context(req->wb_context, prev->wb_context)) - return false; - if (req->wb_context->dentry->d_inode->i_flock != NULL && - !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) - return false; - if (req->wb_pgbase != 0) - return false; - if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) - return false; - if (req_offset(req) != req_offset(prev) + prev->wb_bytes) - return false; - return pgio->pg_ops->pg_test(pgio, prev, req); + size_t size; + + if (prev) { + if (!nfs_match_open_context(req->wb_context, prev->wb_context)) + return false; + if (req->wb_context->dentry->d_inode->i_flock != NULL && + !nfs_match_lock_context(req->wb_lock_context, + prev->wb_lock_context)) + return false; + if (req_offset(req) != req_offset(prev) + prev->wb_bytes) + return false; + } + size = pgio->pg_ops->pg_test(pgio, prev, req); + WARN_ON_ONCE(size > req->wb_bytes); + if (size && size < req->wb_bytes) + req->wb_bytes = size; + return size > 0; } /** @@ -381,17 +864,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { + struct nfs_page *prev = NULL; if (desc->pg_count != 0) { - struct nfs_page *prev; - prev = nfs_list_entry(desc->pg_list.prev); - if (!nfs_can_coalesce_requests(prev, req, desc)) - return 0; } else { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); desc->pg_base = req->wb_pgbase; } + if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; nfs_list_remove_request(req); nfs_list_add_request(req, &desc->pg_list); desc->pg_count += req->wb_bytes; @@ -421,22 +903,72 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) * @desc: destination io descriptor * @req: request * + * This may split a request into subrequests which are all part of the + * same page group. + * * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. */ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { - while (!nfs_pageio_do_add_request(desc, req)) { - desc->pg_moreio = 1; - nfs_pageio_doio(desc); - if (desc->pg_error < 0) - return 0; - desc->pg_moreio = 0; - if (desc->pg_recoalesce) - return 0; - } + struct nfs_page *subreq; + unsigned int bytes_left = 0; + unsigned int offset, pgbase; + + nfs_page_group_lock(req); + + subreq = req; + bytes_left = subreq->wb_bytes; + offset = subreq->wb_offset; + pgbase = subreq->wb_pgbase; + + do { + if (!nfs_pageio_do_add_request(desc, subreq)) { + /* make sure pg_test call(s) did nothing */ + WARN_ON_ONCE(subreq->wb_bytes != bytes_left); + WARN_ON_ONCE(subreq->wb_offset != offset); + WARN_ON_ONCE(subreq->wb_pgbase != pgbase); + + nfs_page_group_unlock(req); + desc->pg_moreio = 1; + nfs_pageio_doio(desc); + if (desc->pg_error < 0) + return 0; + if (desc->pg_recoalesce) + return 0; + /* retry add_request for this subreq */ + nfs_page_group_lock(req); + continue; + } + + /* check for buggy pg_test call(s) */ + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); + WARN_ON_ONCE(subreq->wb_bytes > bytes_left); + WARN_ON_ONCE(subreq->wb_bytes == 0); + + bytes_left -= subreq->wb_bytes; + offset += subreq->wb_bytes; + pgbase += subreq->wb_bytes; + + if (bytes_left) { + subreq = nfs_create_request(req->wb_context, + req->wb_page, + subreq, pgbase, bytes_left); + if (IS_ERR(subreq)) + goto err_ptr; + nfs_lock_request(subreq); + subreq->wb_offset = offset; + subreq->wb_index = req->wb_index; + } + } while (bytes_left > 0); + + nfs_page_group_unlock(req); return 1; +err_ptr: + desc->pg_error = PTR_ERR(subreq); + nfs_page_group_unlock(req); + return 0; } static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) @@ -449,6 +981,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) desc->pg_count = 0; desc->pg_base = 0; desc->pg_recoalesce = 0; + desc->pg_moreio = 0; while (!list_empty(&head)) { struct nfs_page *req; @@ -535,3 +1068,13 @@ void nfs_destroy_nfspagecache(void) kmem_cache_destroy(nfs_page_cachep); } +static const struct rpc_call_ops nfs_pgio_common_ops = { + .rpc_call_prepare = nfs_pgio_prepare, + .rpc_call_done = nfs_pgio_result, + .rpc_release = nfs_pgio_release, +}; + +const struct nfs_pageio_ops nfs_pgio_rw_ops = { + .pg_test = nfs_generic_pg_test, + .pg_doio = nfs_generic_pg_pgios, +}; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d75d938d36c..6fdcd233d6f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -662,7 +662,18 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) */ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) { - return (s32)s1 - (s32)s2 > 0; + return (s32)(s1 - s2) > 0; +} + +static void +pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new, + struct list_head *free_me_list) +{ + if (nfs4_stateid_match_other(&lo->plh_stateid, new)) + return; + /* Layout is new! Kill existing layout segments */ + pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL); } /* update lo->plh_stateid with new if is more recent */ @@ -1315,6 +1326,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; + LIST_HEAD(free_me); int status = 0; /* Inject layout blob into I/O device driver */ @@ -1341,6 +1353,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget_reply; } + /* Check that the new stateid matches the old stateid */ + pnfs_verify_layout_stateid(lo, &res->stateid, &free_me); /* Done processing layoutget. Set the layout stateid */ pnfs_set_layout_stateid(lo, &res->stateid, false); @@ -1355,6 +1369,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me); return lseg; out: return ERR_PTR(status); @@ -1373,11 +1388,6 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r WARN_ON_ONCE(pgio->pg_lseg != NULL); - if (req->wb_offset != req->wb_pgbase) { - nfs_pageio_reset_read_mds(pgio); - return; - } - if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); else @@ -1402,11 +1412,6 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, { WARN_ON_ONCE(pgio->pg_lseg != NULL); - if (req->wb_offset != req->wb_pgbase) { - nfs_pageio_reset_write_mds(pgio); - return; - } - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), @@ -1419,56 +1424,49 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); -void -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, - const struct nfs_pgio_completion_ops *compl_ops) -{ - struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - - if (ld == NULL) - nfs_pageio_init_read(pgio, inode, compl_ops); - else - nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0); -} - -void -pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, - int ioflags, - const struct nfs_pgio_completion_ops *compl_ops) -{ - struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - - if (ld == NULL) - nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); - else - nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags); -} - -bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (pgio->pg_lseg == NULL) - return nfs_generic_pg_test(pgio, prev, req); + unsigned int size; + u64 seg_end, req_start, seg_left; + + size = nfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; /* - * Test if a nfs_page is fully contained in the pnfs_layout_range. - * Note that this test makes several assumptions: - * - that the previous nfs_page in the struct nfs_pageio_descriptor - * is known to lie within the range. - * - that the nfs_page being tested is known to be contiguous with the - * previous nfs_page. - * - Layout ranges are page aligned, so we only have to test the - * start offset of the request. + * 'size' contains the number of bytes left in the current page (up + * to the original size asked for in @req->wb_bytes). + * + * Calculate how many bytes are left in the layout segment + * and if there are less bytes than 'size', return that instead. * * Please also note that 'end_offset' is actually the offset of the * first byte that lies outside the pnfs_layout_range. FIXME? * */ - return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, - pgio->pg_lseg->pls_range.length); + if (pgio->pg_lseg) { + seg_end = end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length); + req_start = req_offset(req); + WARN_ON_ONCE(req_start > seg_end); + /* start of request is past the last byte of this segment */ + if (req_start >= seg_end) + return 0; + + /* adjust 'size' iff there are fewer bytes left in the + * segment than what nfs_generic_pg_test returned */ + seg_left = seg_end - req_start; + if (seg_left < size) + size = (unsigned int)seg_left; + } + + return size; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); @@ -1481,7 +1479,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode, LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); + nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops); pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1504,7 +1502,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode, } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); -static void pnfs_ld_handle_write_error(struct nfs_write_data *data) +static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1523,7 +1521,7 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) /* * Called by non rpc-based layout drivers */ -void pnfs_ld_write_done(struct nfs_write_data *data) +void pnfs_ld_write_done(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1539,7 +1537,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_write_done); static void pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_write_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1548,11 +1546,11 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); desc->pg_recoalesce = 1; } - nfs_writedata_release(data); + nfs_pgio_data_release(data); } static enum pnfs_try_status -pnfs_try_to_write_data(struct nfs_write_data *wdata, +pnfs_try_to_write_data(struct nfs_pgio_data *wdata, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg, int how) @@ -1574,41 +1572,36 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata, } static void -pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) +pnfs_do_write(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, int how) { - struct nfs_write_data *data; + struct nfs_pgio_data *data = hdr->data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; desc->pg_lseg = NULL; - while (!list_empty(head)) { - enum pnfs_try_status trypnfs; - - data = list_first_entry(head, struct nfs_write_data, list); - list_del_init(&data->list); - - trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); - if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_write_through_mds(desc, data); - } + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_write_through_mds(desc, data); pnfs_put_lseg(lseg); } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_writehdr_free(hdr); + nfs_rw_header_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - struct nfs_write_header *whdr; + struct nfs_rw_header *whdr; struct nfs_pgio_header *hdr; int ret; - whdr = nfs_writehdr_alloc(); + whdr = nfs_rw_header_alloc(desc->pg_rw_ops); if (!whdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); pnfs_put_lseg(desc->pg_lseg); @@ -1619,12 +1612,12 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); atomic_inc(&hdr->refcnt); - ret = nfs_generic_flush(desc, hdr); + ret = nfs_generic_pgio(desc, hdr); if (ret != 0) { pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else - pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags); + pnfs_do_write(desc, hdr, desc->pg_ioflags); if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); return ret; @@ -1640,7 +1633,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode, LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_read(&pgio, inode, compl_ops); + nfs_pageio_init_read(&pgio, inode, true, compl_ops); pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1659,7 +1652,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode, } EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); -static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1678,7 +1671,7 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) /* * Called by non rpc-based layout drivers */ -void pnfs_ld_read_done(struct nfs_read_data *data) +void pnfs_ld_read_done(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1694,7 +1687,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_read_done); static void pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_read_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1703,14 +1696,14 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); desc->pg_recoalesce = 1; } - nfs_readdata_release(data); + nfs_pgio_data_release(data); } /* * Call the appropriate parallel I/O subsystem read function. */ static enum pnfs_try_status -pnfs_try_to_read_data(struct nfs_read_data *rdata, +pnfs_try_to_read_data(struct nfs_pgio_data *rdata, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg) { @@ -1732,41 +1725,35 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, } static void -pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { - struct nfs_read_data *data; + struct nfs_pgio_data *data = hdr->data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; desc->pg_lseg = NULL; - while (!list_empty(head)) { - enum pnfs_try_status trypnfs; - - data = list_first_entry(head, struct nfs_read_data, list); - list_del_init(&data->list); - - trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); - if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_read_through_mds(desc, data); - } + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_read_through_mds(desc, data); pnfs_put_lseg(lseg); } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_readhdr_free(hdr); + nfs_rw_header_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - struct nfs_read_header *rhdr; + struct nfs_rw_header *rhdr; struct nfs_pgio_header *hdr; int ret; - rhdr = nfs_readhdr_alloc(); + rhdr = nfs_rw_header_alloc(desc->pg_rw_ops); if (!rhdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); ret = -ENOMEM; @@ -1778,18 +1765,27 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); atomic_inc(&hdr->refcnt); - ret = nfs_generic_pagein(desc, hdr); + ret = nfs_generic_pgio(desc, hdr); if (ret != 0) { pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else - pnfs_do_multiple_reads(desc, &hdr->rpc_list); + pnfs_do_read(desc, hdr); if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); return ret; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); +static void pnfs_clear_layoutcommitting(struct inode *inode) +{ + unsigned long *bitlock = &NFS_I(inode)->flags; + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + /* * There can be multiple RW segments. */ @@ -1807,7 +1803,6 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) { struct pnfs_layout_segment *lseg, *tmp; - unsigned long *bitlock = &NFS_I(inode)->flags; /* Matched by references in pnfs_set_layoutcommit */ list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { @@ -1815,9 +1810,7 @@ static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *lis pnfs_put_lseg(lseg); } - clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); + pnfs_clear_layoutcommitting(inode); } void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) @@ -1827,7 +1820,7 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void -pnfs_set_layoutcommit(struct nfs_write_data *wdata) +pnfs_set_layoutcommit(struct nfs_pgio_data *wdata) { struct nfs_pgio_header *hdr = wdata->header; struct inode *inode = hdr->inode; @@ -1881,43 +1874,37 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) struct nfs4_layoutcommit_data *data; struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; - int status = 0; - - dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + int status; - if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + if (!pnfs_layoutcommit_outstanding(inode)) return 0; - /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ - data = kzalloc(sizeof(*data), GFP_NOFS); - if (!data) { - status = -ENOMEM; - goto out; - } - - if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) - goto out_free; + dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + status = -EAGAIN; if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { - if (!sync) { - status = -EAGAIN; - goto out_free; - } - status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, - nfs_wait_bit_killable, TASK_KILLABLE); + if (!sync) + goto out; + status = wait_on_bit_lock(&nfsi->flags, + NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, + TASK_KILLABLE); if (status) - goto out_free; + goto out; } - INIT_LIST_HEAD(&data->lseg_list); + status = -ENOMEM; + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) + goto clear_layoutcommitting; + + status = 0; spin_lock(&inode->i_lock); - if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); - spin_unlock(&inode->i_lock); - wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); - goto out_free; - } + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_unlock; + INIT_LIST_HEAD(&data->lseg_list); pnfs_list_write_lseg(inode, &data->lseg_list); end_pos = nfsi->layout->plh_lwb; @@ -1940,8 +1927,11 @@ out: mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; -out_free: +out_unlock: + spin_unlock(&inode->i_lock); kfree(data); +clear_layoutcommitting: + pnfs_clear_layoutcommitting(inode); goto out; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index a4f41810a7f..4fb309a2b4c 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -113,8 +113,8 @@ struct pnfs_layoutdriver_type { * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS */ - enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); - enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); + enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data); + enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how); void (*free_deviceid_node) (struct nfs4_deviceid_node *); @@ -180,11 +180,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); -void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, - const struct nfs_pgio_completion_ops *); -void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, - int, const struct nfs_pgio_completion_ops *); - void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); @@ -192,7 +187,8 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); -bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req); void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); @@ -217,13 +213,13 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); -void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); -void pnfs_ld_write_done(struct nfs_write_data *); -void pnfs_ld_read_done(struct nfs_read_data *); +void pnfs_ld_write_done(struct nfs_pgio_data *); +void pnfs_ld_read_done(struct nfs_pgio_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, @@ -275,7 +271,7 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg) { if (lseg) { atomic_inc(&lseg->pls_refcount); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } return lseg; } @@ -359,6 +355,15 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode) PNFS_LAYOUTRET_ON_SETATTR; } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 || + test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0; +} + static inline int pnfs_return_layout(struct inode *ino) { struct nfs_inode *nfsi = NFS_I(ino); @@ -452,18 +457,6 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } -static inline void pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, - const struct nfs_pgio_completion_ops *compl_ops) -{ - nfs_pageio_init_read(pgio, inode, compl_ops); -} - -static inline void pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, - const struct nfs_pgio_completion_ops *compl_ops) -{ - nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); -} - static inline int pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) @@ -515,6 +508,13 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, return false; } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + return false; +} + + static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { return NULL; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index fddbba2d9ef..c171ce1a8a3 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -357,30 +357,6 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, } static int -nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs_renameargs arg = { - .old_dir = NFS_FH(old_dir), - .old_name = old_name, - .new_dir = NFS_FH(new_dir), - .new_name = new_name, - }; - struct rpc_message msg = { - .rpc_proc = &nfs_procedures[NFSPROC_RENAME], - .rpc_argp = &arg, - }; - int status; - - dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); - nfs_mark_for_revalidate(old_dir); - nfs_mark_for_revalidate(new_dir); - dprintk("NFS reply rename: %d\n", status); - return status; -} - -static int nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { struct nfs_linkargs arg = { @@ -602,7 +578,7 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return 0; } -static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -618,18 +594,18 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; } -static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); return 0; } -static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -638,19 +614,13 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } -static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ data->args.stable = NFS_FILE_SYNC; msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; } -static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ - rpc_call_start(task); - return 0; -} - static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { BUG(); @@ -745,7 +715,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .unlink_setup = nfs_proc_unlink_setup, .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare, .unlink_done = nfs_proc_unlink_done, - .rename = nfs_proc_rename, .rename_setup = nfs_proc_rename_setup, .rename_rpc_prepare = nfs_proc_rename_rpc_prepare, .rename_done = nfs_proc_rename_done, @@ -759,13 +728,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .fsinfo = nfs_proc_fsinfo, .pathconf = nfs_proc_pathconf, .decode_dirent = nfs2_decode_dirent, + .pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare, .read_setup = nfs_proc_read_setup, - .read_pageio_init = nfs_pageio_init_read, - .read_rpc_prepare = nfs_proc_read_rpc_prepare, .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, - .write_pageio_init = nfs_pageio_init_write, - .write_rpc_prepare = nfs_proc_write_rpc_prepare, .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, .commit_rpc_prepare = nfs_proc_commit_rpc_prepare, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 31db5c366b8..e818a475ca6 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -24,85 +24,24 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static const struct nfs_pageio_ops nfs_pageio_read_ops; -static const struct rpc_call_ops nfs_read_common_ops; static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; +static const struct nfs_rw_ops nfs_rw_read_ops; static struct kmem_cache *nfs_rdata_cachep; -struct nfs_read_header *nfs_readhdr_alloc(void) +static struct nfs_rw_header *nfs_readhdr_alloc(void) { - struct nfs_read_header *rhdr; - - rhdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); - if (rhdr) { - struct nfs_pgio_header *hdr = &rhdr->header; - - INIT_LIST_HEAD(&hdr->pages); - INIT_LIST_HEAD(&hdr->rpc_list); - spin_lock_init(&hdr->lock); - atomic_set(&hdr->refcnt, 0); - } - return rhdr; + return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); } -EXPORT_SYMBOL_GPL(nfs_readhdr_alloc); -static struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr, - unsigned int pagecount) +static void nfs_readhdr_free(struct nfs_rw_header *rhdr) { - struct nfs_read_data *data, *prealloc; - - prealloc = &container_of(hdr, struct nfs_read_header, header)->rpc_data; - if (prealloc->header == NULL) - data = prealloc; - else - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out; - - if (nfs_pgarray_set(&data->pages, pagecount)) { - data->header = hdr; - atomic_inc(&hdr->refcnt); - } else { - if (data != prealloc) - kfree(data); - data = NULL; - } -out: - return data; -} - -void nfs_readhdr_free(struct nfs_pgio_header *hdr) -{ - struct nfs_read_header *rhdr = container_of(hdr, struct nfs_read_header, header); - kmem_cache_free(nfs_rdata_cachep, rhdr); } -EXPORT_SYMBOL_GPL(nfs_readhdr_free); - -void nfs_readdata_release(struct nfs_read_data *rdata) -{ - struct nfs_pgio_header *hdr = rdata->header; - struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header); - - put_nfs_open_context(rdata->args.context); - if (rdata->pages.pagevec != rdata->pages.page_array) - kfree(rdata->pages.pagevec); - if (rdata == &read_header->rpc_data) { - rdata->header = NULL; - rdata = NULL; - } - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - /* Note: we only free the rpc_task after callbacks are done. - * See the comment in rpc_free_task() for why - */ - kfree(rdata); -} -EXPORT_SYMBOL_GPL(nfs_readdata_release); static int nfs_return_empty_page(struct page *page) @@ -114,17 +53,24 @@ int nfs_return_empty_page(struct page *page) } void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode, + struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops) { - nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, compl_ops, - NFS_SERVER(inode)->rsize, 0); + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_read_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, + server->rsize, 0); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { - pgio->pg_ops = &nfs_pageio_read_ops; + pgio->pg_ops = &nfs_pgio_rw_ops; pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); @@ -139,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(ctx, inode, page, 0, len); + new = nfs_create_request(ctx, page, NULL, 0, len); if (IS_ERR(new)) { unlock_page(page); return PTR_ERR(new); @@ -147,7 +93,8 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); nfs_pageio_add_request(&pgio, new); nfs_pageio_complete(&pgio); NFS_I(inode)->read_io += pgio.pg_bytes_written; @@ -158,20 +105,31 @@ static void nfs_readpage_release(struct nfs_page *req) { struct inode *d_inode = req->wb_context->dentry->d_inode; - if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + dprintk("NFS: read done (%s/%llu %d@%lld)\n", d_inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode), req->wb_bytes, + (long long)req_offset(req)); - unlock_page(req->wb_page); + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(d_inode, req->wb_page, 0); - dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", + unlock_page(req->wb_page); + } + + dprintk("NFS: read done (%s/%Lu %d@%Ld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); } -/* Note io was page aligned */ +static void nfs_page_group_set_uptodate(struct nfs_page *req) +{ + if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) + SetPageUptodate(req->wb_page); +} + static void nfs_read_completion(struct nfs_pgio_header *hdr) { unsigned long bytes = 0; @@ -181,21 +139,32 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; + unsigned long start = req->wb_pgbase; + unsigned long end = req->wb_pgbase + req->wb_bytes; if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { - if (bytes > hdr->good_bytes) - zero_user(page, 0, PAGE_SIZE); - else if (hdr->good_bytes - bytes < PAGE_SIZE) - zero_user_segment(page, - hdr->good_bytes & ~PAGE_MASK, - PAGE_SIZE); + /* note: regions of the page not covered by a + * request are zeroed in nfs_readpage_async / + * readpage_async_filler */ + if (bytes > hdr->good_bytes) { + /* nothing in this request was good, so zero + * the full extent of the request */ + zero_user_segment(page, start, end); + + } else if (hdr->good_bytes - bytes < req->wb_bytes) { + /* part of this request has good bytes, but + * not all. zero the bad bytes */ + start += hdr->good_bytes - bytes; + WARN_ON(start < req->wb_pgbase); + zero_user_segment(page, start, end); + } } bytes += req->wb_bytes; if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (bytes <= hdr->good_bytes) - SetPageUptodate(page); + nfs_page_group_set_uptodate(req); } else - SetPageUptodate(page); + nfs_page_group_set_uptodate(req); nfs_list_remove_request(req); nfs_readpage_release(req); } @@ -203,95 +172,14 @@ out: hdr->release(hdr); } -int nfs_initiate_read(struct rpc_clnt *clnt, - struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, int flags) +static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg, + struct rpc_task_setup *task_setup_data, int how) { struct inode *inode = data->header->inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = data->header->cred, - }; - struct rpc_task_setup task_setup_data = { - .task = &data->task, - .rpc_client = clnt, - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | swap_flags | flags, - }; - - /* Set up the initial task struct. */ - NFS_PROTO(inode)->read_setup(data, &msg); - - dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ " - "offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - data->args.count, - (unsigned long long)data->args.offset); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - rpc_put_task(task); - return 0; -} -EXPORT_SYMBOL_GPL(nfs_initiate_read); -/* - * Set up the NFS read request struct - */ -static void nfs_read_rpcsetup(struct nfs_read_data *data, - unsigned int count, unsigned int offset) -{ - struct nfs_page *req = data->header->req; - - data->args.fh = NFS_FH(data->header->inode); - data->args.offset = req_offset(req) + offset; - data->args.pgbase = req->wb_pgbase + offset; - data->args.pages = data->pages.pagevec; - data->args.count = count; - data->args.context = get_nfs_open_context(req->wb_context); - data->args.lock_context = req->wb_lock_context; - - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.eof = 0; - nfs_fattr_init(&data->fattr); -} - -static int nfs_do_read(struct nfs_read_data *data, - const struct rpc_call_ops *call_ops) -{ - struct inode *inode = data->header->inode; - - return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0); -} - -static int -nfs_do_multiple_reads(struct list_head *head, - const struct rpc_call_ops *call_ops) -{ - struct nfs_read_data *data; - int ret = 0; - - while (!list_empty(head)) { - int ret2; - - data = list_first_entry(head, struct nfs_read_data, list); - list_del_init(&data->list); - - ret2 = nfs_do_read(data, call_ops); - if (ret == 0) - ret = ret2; - } - return ret; + task_setup_data->flags |= swap_flags; + NFS_PROTO(inode)->read_setup(data, msg); } static void @@ -311,143 +199,14 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { .completion = nfs_read_completion, }; -static void nfs_pagein_error(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - set_bit(NFS_IOHDR_REDO, &hdr->flags); - while (!list_empty(&hdr->rpc_list)) { - struct nfs_read_data *data = list_first_entry(&hdr->rpc_list, - struct nfs_read_data, list); - list_del(&data->list); - nfs_readdata_release(data); - } - desc->pg_completion_ops->error_cleanup(&desc->pg_list); -} - -/* - * Generate multiple requests to fill a single page. - * - * We optimize to reduce the number of read operations on the wire. If we - * detect that we're reading a page, or an area of a page, that is past the - * end of file, we do not generate NFS read operations but just clear the - * parts of the page that would have come back zero from the server anyway. - * - * We rely on the cached value of i_size to make this determination; another - * client can fill pages on the server past our cached end-of-file, but we - * won't see the new data until our attribute cache is updated. This is more - * or less conventional NFS client behavior. - */ -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req = hdr->req; - struct page *page = req->wb_page; - struct nfs_read_data *data; - size_t rsize = desc->pg_bsize, nbytes; - unsigned int offset; - - offset = 0; - nbytes = desc->pg_count; - do { - size_t len = min(nbytes,rsize); - - data = nfs_readdata_alloc(hdr, 1); - if (!data) { - nfs_pagein_error(desc, hdr); - return -ENOMEM; - } - data->pages.pagevec[0] = page; - nfs_read_rpcsetup(data, len, offset); - list_add(&data->list, &hdr->rpc_list); - nbytes -= len; - offset += len; - } while (nbytes != 0); - - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - desc->pg_rpc_callops = &nfs_read_common_ops; - return 0; -} - -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req; - struct page **pages; - struct nfs_read_data *data; - struct list_head *head = &desc->pg_list; - - data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base, - desc->pg_count)); - if (!data) { - nfs_pagein_error(desc, hdr); - return -ENOMEM; - } - - pages = data->pages.pagevec; - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - *pages++ = req->wb_page; - } - - nfs_read_rpcsetup(data, desc->pg_count, 0); - list_add(&data->list, &hdr->rpc_list); - desc->pg_rpc_callops = &nfs_read_common_ops; - return 0; -} - -int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - if (desc->pg_bsize < PAGE_CACHE_SIZE) - return nfs_pagein_multi(desc, hdr); - return nfs_pagein_one(desc, hdr); -} -EXPORT_SYMBOL_GPL(nfs_generic_pagein); - -static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) -{ - struct nfs_read_header *rhdr; - struct nfs_pgio_header *hdr; - int ret; - - rhdr = nfs_readhdr_alloc(); - if (!rhdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); - return -ENOMEM; - } - hdr = &rhdr->header; - nfs_pgheader_init(desc, hdr, nfs_readhdr_free); - atomic_inc(&hdr->refcnt); - ret = nfs_generic_pagein(desc, hdr); - if (ret == 0) - ret = nfs_do_multiple_reads(&hdr->rpc_list, - desc->pg_rpc_callops); - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - return ret; -} - -static const struct nfs_pageio_ops nfs_pageio_read_ops = { - .pg_test = nfs_generic_pg_test, - .pg_doio = nfs_generic_pg_readpages, -}; - /* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). */ -int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data, + struct inode *inode) { - struct inode *inode = data->header->inode; - int status; - - dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, - task->tk_status); - - status = NFS_PROTO(inode)->read_done(task, data); + int status = NFS_PROTO(inode)->read_done(task, data); if (status != 0) return status; @@ -460,10 +219,10 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) +static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data) { - struct nfs_readargs *argp = &data->args; - struct nfs_readres *resp = &data->res; + struct nfs_pgio_args *argp = &data->args; + struct nfs_pgio_res *resp = &data->res; /* This is a short read! */ nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD); @@ -480,17 +239,11 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data rpc_restart_call_prepare(task); } -static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) +static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data) { - struct nfs_read_data *data = calldata; struct nfs_pgio_header *hdr = data->header; - /* Note the only returns of nfs_readpage_result are 0 and -EAGAIN */ - if (nfs_readpage_result(task, data) != 0) - return; - if (task->tk_status < 0) - nfs_set_pgio_error(hdr, task->tk_status, data->args.offset); - else if (data->res.eof) { + if (data->res.eof) { loff_t bound; bound = data->args.offset + data->res.count; @@ -505,26 +258,6 @@ static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) nfs_readpage_retry(task, data); } -static void nfs_readpage_release_common(void *calldata) -{ - nfs_readdata_release(calldata); -} - -void nfs_read_prepare(struct rpc_task *task, void *calldata) -{ - struct nfs_read_data *data = calldata; - int err; - err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); - if (err) - rpc_exit(task, err); -} - -static const struct rpc_call_ops nfs_read_common_ops = { - .rpc_call_prepare = nfs_read_prepare, - .rpc_call_done = nfs_readpage_result_common, - .rpc_release = nfs_readpage_release_common, -}; - /* * Read a page over NFS. * We read the page synchronously in the following case: @@ -592,7 +325,6 @@ static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page_file_mapping(page)->host; struct nfs_page *new; unsigned int len; int error; @@ -601,7 +333,7 @@ readpage_async_filler(void *data, struct page *page) if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(desc->ctx, inode, page, 0, len); + new = nfs_create_request(desc->ctx, page, NULL, 0, len); if (IS_ERR(new)) goto out_error; @@ -630,9 +362,9 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, unsigned long npages; int ret = -ESTALE; - dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", + dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); @@ -654,7 +386,8 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); @@ -671,7 +404,7 @@ out: int __init nfs_init_readpagecache(void) { nfs_rdata_cachep = kmem_cache_create("nfs_read_data", - sizeof(struct nfs_read_header), + sizeof(struct nfs_rw_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_rdata_cachep == NULL) @@ -684,3 +417,12 @@ void nfs_destroy_readpagecache(void) { kmem_cache_destroy(nfs_rdata_cachep); } + +static const struct nfs_rw_ops nfs_rw_read_ops = { + .rw_mode = FMODE_READ, + .rw_alloc_header = nfs_readhdr_alloc, + .rw_free_header = nfs_readhdr_free, + .rw_done = nfs_readpage_done, + .rw_result = nfs_readpage_result, + .rw_initiate = nfs_initiate_read, +}; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a03b9c6f948..084af1060d7 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -497,7 +497,8 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) static const struct { rpc_authflavor_t flavour; const char *str; - } sec_flavours[] = { + } sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = { + /* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */ { RPC_AUTH_NULL, "null" }, { RPC_AUTH_UNIX, "sys" }, { RPC_AUTH_GSS_KRB5, "krb5" }, @@ -923,8 +924,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) data->mount_server.port = NFS_UNSPEC_PORT; data->nfs_server.port = NFS_UNSPEC_PORT; data->nfs_server.protocol = XPRT_TRANSPORT_TCP; - data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR; - data->auth_flavor_len = 0; + data->selected_flavor = RPC_AUTH_MAXFLAVOR; data->minorversion = 0; data->need_mount = true; data->net = current->nsproxy->net_ns; @@ -1019,14 +1019,53 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) } } -static void nfs_set_auth_parsed_mount_data(struct nfs_parsed_mount_data *data, - rpc_authflavor_t pseudoflavor) +/* + * Add 'flavor' to 'auth_info' if not already present. + * Returns true if 'flavor' ends up in the list, false otherwise + */ +static bool nfs_auth_info_add(struct nfs_auth_info *auth_info, + rpc_authflavor_t flavor) { - data->auth_flavors[0] = pseudoflavor; - data->auth_flavor_len = 1; + unsigned int i; + unsigned int max_flavor_len = (sizeof(auth_info->flavors) / + sizeof(auth_info->flavors[0])); + + /* make sure this flavor isn't already in the list */ + for (i = 0; i < auth_info->flavor_len; i++) { + if (flavor == auth_info->flavors[i]) + return true; + } + + if (auth_info->flavor_len + 1 >= max_flavor_len) { + dfprintk(MOUNT, "NFS: too many sec= flavors\n"); + return false; + } + + auth_info->flavors[auth_info->flavor_len++] = flavor; + return true; } /* + * Return true if 'match' is in auth_info or auth_info is empty. + * Return false otherwise. + */ +bool nfs_auth_info_match(const struct nfs_auth_info *auth_info, + rpc_authflavor_t match) +{ + int i; + + if (!auth_info->flavor_len) + return true; + + for (i = 0; i < auth_info->flavor_len; i++) { + if (auth_info->flavors[i] == match) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(nfs_auth_info_match); + +/* * Parse the value of the 'sec=' option. */ static int nfs_parse_security_flavors(char *value, @@ -1034,49 +1073,55 @@ static int nfs_parse_security_flavors(char *value, { substring_t args[MAX_OPT_ARGS]; rpc_authflavor_t pseudoflavor; + char *p; dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); - switch (match_token(value, nfs_secflavor_tokens, args)) { - case Opt_sec_none: - pseudoflavor = RPC_AUTH_NULL; - break; - case Opt_sec_sys: - pseudoflavor = RPC_AUTH_UNIX; - break; - case Opt_sec_krb5: - pseudoflavor = RPC_AUTH_GSS_KRB5; - break; - case Opt_sec_krb5i: - pseudoflavor = RPC_AUTH_GSS_KRB5I; - break; - case Opt_sec_krb5p: - pseudoflavor = RPC_AUTH_GSS_KRB5P; - break; - case Opt_sec_lkey: - pseudoflavor = RPC_AUTH_GSS_LKEY; - break; - case Opt_sec_lkeyi: - pseudoflavor = RPC_AUTH_GSS_LKEYI; - break; - case Opt_sec_lkeyp: - pseudoflavor = RPC_AUTH_GSS_LKEYP; - break; - case Opt_sec_spkm: - pseudoflavor = RPC_AUTH_GSS_SPKM; - break; - case Opt_sec_spkmi: - pseudoflavor = RPC_AUTH_GSS_SPKMI; - break; - case Opt_sec_spkmp: - pseudoflavor = RPC_AUTH_GSS_SPKMP; - break; - default: - return 0; + while ((p = strsep(&value, ":")) != NULL) { + switch (match_token(p, nfs_secflavor_tokens, args)) { + case Opt_sec_none: + pseudoflavor = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + pseudoflavor = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + pseudoflavor = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + pseudoflavor = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + pseudoflavor = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + pseudoflavor = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + pseudoflavor = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + pseudoflavor = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + pseudoflavor = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + pseudoflavor = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + pseudoflavor = RPC_AUTH_GSS_SPKMP; + break; + default: + dfprintk(MOUNT, + "NFS: sec= option '%s' not recognized\n", p); + return 0; + } + + if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor)) + return 0; } - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - nfs_set_auth_parsed_mount_data(mnt, pseudoflavor); return 1; } @@ -1569,7 +1614,7 @@ static int nfs_parse_mount_options(char *raw, goto out_minorversion_mismatch; if (mnt->options & NFS_OPTION_MIGRATION && - mnt->version != 4 && mnt->minorversion != 0) + (mnt->version != 4 || mnt->minorversion != 0)) goto out_migration_misuse; /* @@ -1623,12 +1668,14 @@ out_security_failure: } /* - * Ensure that the specified authtype in args->auth_flavors[0] is supported by - * the server. Returns 0 if it's ok, and -EACCES if not. + * Ensure that a specified authtype in args->auth_info is supported by + * the server. Returns 0 and sets args->selected_flavor if it's ok, and + * -EACCES if not. */ -static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args, +static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, rpc_authflavor_t *server_authlist, unsigned int count) { + rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; unsigned int i; /* @@ -1640,17 +1687,20 @@ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args, * can be used. */ for (i = 0; i < count; i++) { - if (args->auth_flavors[0] == server_authlist[i] || - server_authlist[i] == RPC_AUTH_NULL) + flavor = server_authlist[i]; + + if (nfs_auth_info_match(&args->auth_info, flavor) || + flavor == RPC_AUTH_NULL) goto out; } - dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n", - args->auth_flavors[0]); + dfprintk(MOUNT, + "NFS: specified auth flavors not supported by server\n"); return -EACCES; out: - dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); + args->selected_flavor = flavor; + dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor); return 0; } @@ -1738,9 +1788,10 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf * Was a sec= authflavor specified in the options? First, verify * whether the server supports it, and then just try to use it if so. */ - if (args->auth_flavor_len > 0) { - status = nfs_verify_authflavor(args, authlist, authlist_len); - dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); + if (args->auth_info.flavor_len > 0) { + status = nfs_verify_authflavors(args, authlist, authlist_len); + dfprintk(MOUNT, "NFS: using auth flavor %u\n", + args->selected_flavor); if (status) return ERR_PTR(status); return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); @@ -1769,7 +1820,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf /* Fallthrough */ } dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); - nfs_set_auth_parsed_mount_data(args, flavor); + args->selected_flavor = flavor; server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); if (!IS_ERR(server)) return server; @@ -1785,7 +1836,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf /* Last chance! Try AUTH_UNIX */ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); - nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); + args->selected_flavor = RPC_AUTH_UNIX; return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); } @@ -1972,9 +2023,9 @@ static int nfs23_validate_mount_data(void *options, args->bsize = data->bsize; if (data->flags & NFS_MOUNT_SECFLAVOUR) - nfs_set_auth_parsed_mount_data(args, data->pseudoflavor); + args->selected_flavor = data->pseudoflavor; else - nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); + args->selected_flavor = RPC_AUTH_UNIX; if (!args->nfs_server.hostname) goto out_nomem; @@ -2108,9 +2159,6 @@ static int nfs_validate_text_mount_data(void *options, nfs_set_port(sap, &args->nfs_server.port, port); - if (args->auth_flavor_len > 1) - goto out_bad_auth; - return nfs_parse_devname(dev_name, &args->nfs_server.hostname, max_namelen, @@ -2130,21 +2178,31 @@ out_invalid_transport_udp: out_no_address: dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); return -EINVAL; - -out_bad_auth: - dfprintk(MOUNT, "NFS: Too many RPC auth flavours specified\n"); - return -EINVAL; } +#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ + | NFS_MOUNT_SECURE \ + | NFS_MOUNT_TCP \ + | NFS_MOUNT_VER3 \ + | NFS_MOUNT_KERBEROS \ + | NFS_MOUNT_NONLM \ + | NFS_MOUNT_BROKEN_SUID \ + | NFS_MOUNT_STRICTLOCK \ + | NFS_MOUNT_UNSHARED \ + | NFS_MOUNT_NORESVPORT \ + | NFS_MOUNT_LEGACY_INTERFACE) + static int nfs_compare_remount_data(struct nfs_server *nfss, struct nfs_parsed_mount_data *data) { - if (data->flags != nfss->flags || + if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK || data->rsize != nfss->rsize || data->wsize != nfss->wsize || + data->version != nfss->nfs_client->rpc_ops->version || + data->minorversion != nfss->nfs_client->cl_minorversion || data->retrans != nfss->client->cl_timeout->to_retries || - data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || + data->selected_flavor != nfss->client->cl_auth->au_flavor || data->acregmin != nfss->acregmin / HZ || data->acregmax != nfss->acregmax / HZ || data->acdirmin != nfss->acdirmin / HZ || @@ -2169,6 +2227,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; u32 nfsvers = nfss->nfs_client->rpc_ops->version; + sync_filesystem(sb); + /* * Userspace mount programs that send binary options generally send * them populated with default values. We have no way to know which @@ -2189,7 +2249,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->rsize = nfss->rsize; data->wsize = nfss->wsize; data->retrans = nfss->client->cl_timeout->to_retries; - nfs_set_auth_parsed_mount_data(data, nfss->client->cl_auth->au_flavor); + data->selected_flavor = nfss->client->cl_auth->au_flavor; + data->auth_info = nfss->auth_info; data->acregmin = nfss->acregmin / HZ; data->acregmax = nfss->acregmax / HZ; data->acdirmin = nfss->acdirmin / HZ; @@ -2197,12 +2258,15 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; data->nfs_server.port = nfss->port; data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; + data->version = nfsvers; + data->minorversion = nfss->nfs_client->cl_minorversion; + data->net = current->nsproxy->net_ns; memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, data->nfs_server.addrlen); /* overwrite those values with any that were specified */ - error = nfs_parse_mount_options((char *)options, data); - if (error < 0) + error = -EINVAL; + if (!nfs_parse_mount_options((char *)options, data)) goto out; /* @@ -2296,18 +2360,6 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) nfs_initialise_sb(sb); } -#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ - | NFS_MOUNT_SECURE \ - | NFS_MOUNT_TCP \ - | NFS_MOUNT_VER3 \ - | NFS_MOUNT_KERBEROS \ - | NFS_MOUNT_NONLM \ - | NFS_MOUNT_BROKEN_SUID \ - | NFS_MOUNT_STRICTLOCK \ - | NFS_MOUNT_UNSHARED \ - | NFS_MOUNT_NORESVPORT \ - | NFS_MOUNT_LEGACY_INTERFACE) - static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) { const struct nfs_server *a = s->s_fs_info; @@ -2332,7 +2384,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n goto Ebusy; if (a->acdirmax != b->acdirmax) goto Ebusy; - if (b->flags & NFS_MOUNT_SECFLAVOUR && + if (b->auth_info.flavor_len > 0 && clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) goto Ebusy; return 1; @@ -2530,6 +2582,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, mntroot = ERR_PTR(error); goto error_splat_bdi; } + server->super = s; } if (!s->s_root) { @@ -2713,9 +2766,9 @@ static int nfs4_validate_mount_data(void *options, data->auth_flavours, sizeof(pseudoflavor))) return -EFAULT; - nfs_set_auth_parsed_mount_data(args, pseudoflavor); + args->selected_flavor = pseudoflavor; } else - nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); + args->selected_flavor = RPC_AUTH_UNIX; c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); if (IS_ERR(c)) diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 6b3f2535a3e..bb6ed810fa6 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -13,7 +13,7 @@ static struct ctl_table_header *nfs_callback_sysctl_table; -static ctl_table nfs_cb_sysctls[] = { +static struct ctl_table nfs_cb_sysctls[] = { { .procname = "nfs_mountpoint_timeout", .data = &nfs_mountpoint_expiry_timeout, @@ -31,7 +31,7 @@ static ctl_table nfs_cb_sysctls[] = { { } }; -static ctl_table nfs_cb_sysctl_dir[] = { +static struct ctl_table nfs_cb_sysctl_dir[] = { { .procname = "nfs", .mode = 0555, @@ -40,7 +40,7 @@ static ctl_table nfs_cb_sysctl_dir[] = { { } }; -static ctl_table nfs_cb_sysctl_root[] = { +static struct ctl_table nfs_cb_sysctl_root[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 8285de9eaad..de54129336c 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -14,6 +14,7 @@ #include <linux/sched.h> #include <linux/wait.h> #include <linux/namei.h> +#include <linux/fsnotify.h> #include "internal.h" #include "nfs4_fs.h" @@ -353,8 +354,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) return; } - if (task->tk_status != 0) - nfs_cancel_async_unlink(old_dentry); + if (data->complete) + data->complete(task, data); } /** @@ -399,9 +400,10 @@ static const struct rpc_call_ops nfs_rename_ops = { * * It's expected that valid references to the dentries and inodes are held */ -static struct rpc_task * +struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, - struct dentry *old_dentry, struct dentry *new_dentry) + struct dentry *old_dentry, struct dentry *new_dentry, + void (*complete)(struct rpc_task *, struct nfs_renamedata *)) { struct nfs_renamedata *data; struct rpc_message msg = { }; @@ -438,6 +440,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, data->new_dentry = dget(new_dentry); nfs_fattr_init(&data->old_fattr); nfs_fattr_init(&data->new_fattr); + data->complete = complete; /* set up nfs_renameargs */ data->args.old_dir = NFS_FH(old_dir); @@ -456,6 +459,27 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, return rpc_run_task(&task_setup_data); } +/* + * Perform tasks needed when a sillyrename is done such as cancelling the + * queued async unlink if it failed. + */ +static void +nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data) +{ + struct dentry *dentry = data->old_dentry; + + if (task->tk_status != 0) { + nfs_cancel_async_unlink(dentry); + return; + } + + /* + * vfs_unlink and the like do not issue this when a file is + * sillyrenamed, so do it here. + */ + fsnotify_nameremove(dentry, 0); +} + #define SILLYNAME_PREFIX ".nfs" #define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1) #define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) @@ -493,7 +517,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) unsigned long long fileid; struct dentry *sdentry; struct rpc_task *task; - int error = -EIO; + int error = -EBUSY; dfprintk(VFS, "NFS: silly-rename(%pd2, ct=%d)\n", dentry, d_count(dentry)); @@ -502,7 +526,6 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) /* * We don't allow a dentry to be silly-renamed twice. */ - error = -EBUSY; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto out; @@ -549,7 +572,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) } /* run the rename task, undo unlink if it fails */ - task = nfs_async_rename(dir, dir, dentry, sdentry); + task = nfs_async_rename(dir, dir, dentry, sdentry, + nfs_complete_sillyrename); if (IS_ERR(task)) { error = -EBUSY; nfs_cancel_async_unlink(dentry); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c1d548211c3..5e2f1030454 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -42,10 +42,11 @@ * Local function declarations */ static void nfs_redirty_request(struct nfs_page *req); -static const struct rpc_call_ops nfs_write_common_ops; static const struct rpc_call_ops nfs_commit_ops; static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; +static const struct nfs_rw_ops nfs_rw_write_ops; +static void nfs_clear_request_commit(struct nfs_page *req); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -70,76 +71,19 @@ void nfs_commit_free(struct nfs_commit_data *p) } EXPORT_SYMBOL_GPL(nfs_commit_free); -struct nfs_write_header *nfs_writehdr_alloc(void) +static struct nfs_rw_header *nfs_writehdr_alloc(void) { - struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - - if (p) { - struct nfs_pgio_header *hdr = &p->header; + struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); + if (p) memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&hdr->pages); - INIT_LIST_HEAD(&hdr->rpc_list); - spin_lock_init(&hdr->lock); - atomic_set(&hdr->refcnt, 0); - hdr->verf = &p->verf; - } return p; } -EXPORT_SYMBOL_GPL(nfs_writehdr_alloc); -static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr, - unsigned int pagecount) +static void nfs_writehdr_free(struct nfs_rw_header *whdr) { - struct nfs_write_data *data, *prealloc; - - prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data; - if (prealloc->header == NULL) - data = prealloc; - else - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out; - - if (nfs_pgarray_set(&data->pages, pagecount)) { - data->header = hdr; - atomic_inc(&hdr->refcnt); - } else { - if (data != prealloc) - kfree(data); - data = NULL; - } -out: - return data; -} - -void nfs_writehdr_free(struct nfs_pgio_header *hdr) -{ - struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header); mempool_free(whdr, nfs_wdata_mempool); } -EXPORT_SYMBOL_GPL(nfs_writehdr_free); - -void nfs_writedata_release(struct nfs_write_data *wdata) -{ - struct nfs_pgio_header *hdr = wdata->header; - struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header); - - put_nfs_open_context(wdata->args.context); - if (wdata->pages.pagevec != wdata->pages.page_array) - kfree(wdata->pages.pagevec); - if (wdata == &write_header->rpc_data) { - wdata->header = NULL; - wdata = NULL; - } - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - /* Note: we only free the rpc_task after callbacks are done. - * See the comment in rpc_free_task() for why - */ - kfree(wdata); -} -EXPORT_SYMBOL_GPL(nfs_writedata_release); static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) { @@ -148,8 +92,15 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); } +/* + * nfs_page_find_head_request_locked - find head request associated with @page + * + * must be called while holding the inode lock. + * + * returns matching head request with reference held, or NULL if not found. + */ static struct nfs_page * -nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) +nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) { struct nfs_page *req = NULL; @@ -161,25 +112,33 @@ nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) /* Linearly search the commit list for the correct req */ list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) { if (freq->wb_page == page) { - req = freq; + req = freq->wb_head; break; } } } - if (req) + if (req) { + WARN_ON_ONCE(req->wb_head != req); + kref_get(&req->wb_kref); + } return req; } -static struct nfs_page *nfs_page_find_request(struct page *page) +/* + * nfs_page_find_head_request - find head request associated with @page + * + * returns matching head request with reference held, or NULL if not found. + */ +static struct nfs_page *nfs_page_find_head_request(struct page *page) { struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req = NULL; spin_lock(&inode->i_lock); - req = nfs_page_find_request_locked(NFS_I(inode), page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); spin_unlock(&inode->i_lock); return req; } @@ -211,18 +170,78 @@ static void nfs_set_pageerror(struct page *page) nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); } +/* + * nfs_page_group_search_locked + * @head - head request of page group + * @page_offset - offset into page + * + * Search page group with head @head to find a request that contains the + * page offset @page_offset. + * + * Returns a pointer to the first matching nfs request, or NULL if no + * match is found. + * + * Must be called with the page group lock held + */ +static struct nfs_page * +nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) +{ + struct nfs_page *req; + + WARN_ON_ONCE(head != head->wb_head); + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); + + req = head; + do { + if (page_offset >= req->wb_pgbase && + page_offset < (req->wb_pgbase + req->wb_bytes)) + return req; + + req = req->wb_this_page; + } while (req != head); + + return NULL; +} + +/* + * nfs_page_group_covers_page + * @head - head request of page group + * + * Return true if the page group with head @head covers the whole page, + * returns false otherwise + */ +static bool nfs_page_group_covers_page(struct nfs_page *req) +{ + struct nfs_page *tmp; + unsigned int pos = 0; + unsigned int len = nfs_page_length(req->wb_page); + + nfs_page_group_lock(req); + + do { + tmp = nfs_page_group_search_locked(req->wb_head, pos); + if (tmp) { + /* no way this should happen */ + WARN_ON_ONCE(tmp->wb_pgbase != pos); + pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); + } + } while (tmp && pos < len); + + nfs_page_group_unlock(req); + WARN_ON_ONCE(pos > len); + return pos == len; +} + /* We can set the PG_uptodate flag if we see that a write request * covers the full page. */ -static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) +static void nfs_mark_uptodate(struct nfs_page *req) { - if (PageUptodate(page)) - return; - if (base != 0) + if (PageUptodate(req->wb_page)) return; - if (count != nfs_page_length(page)) + if (!nfs_page_group_covers_page(req)) return; - SetPageUptodate(page); + SetPageUptodate(req->wb_page); } static int wb_priority(struct writeback_control *wbc) @@ -258,46 +277,259 @@ static void nfs_set_page_writeback(struct page *page) } } -static void nfs_end_page_writeback(struct page *page) +static void nfs_end_page_writeback(struct nfs_page *req) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = page_file_mapping(req->wb_page)->host; struct nfs_server *nfss = NFS_SERVER(inode); - end_page_writeback(page); + if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) + return; + + end_page_writeback(req->wb_page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) + +/* nfs_page_group_clear_bits + * @req - an nfs request + * clears all page group related bits from @req + */ +static void +nfs_page_group_clear_bits(struct nfs_page *req) +{ + clear_bit(PG_TEARDOWN, &req->wb_flags); + clear_bit(PG_UNLOCKPAGE, &req->wb_flags); + clear_bit(PG_UPTODATE, &req->wb_flags); + clear_bit(PG_WB_END, &req->wb_flags); + clear_bit(PG_REMOVE, &req->wb_flags); +} + + +/* + * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req + * + * this is a helper function for nfs_lock_and_join_requests + * + * @inode - inode associated with request page group, must be holding inode lock + * @head - head request of page group, must be holding head lock + * @req - request that couldn't lock and needs to wait on the req bit lock + * @nonblock - if true, don't actually wait + * + * NOTE: this must be called holding page_group bit lock and inode spin lock + * and BOTH will be released before returning. + * + * returns 0 on success, < 0 on error. + */ +static int +nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, + struct nfs_page *req, bool nonblock) + __releases(&inode->i_lock) +{ + struct nfs_page *tmp; + int ret; + + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) + nfs_unlock_request(tmp); + + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + + /* grab a ref on the request that will be waited on */ + kref_get(&req->wb_kref); + + nfs_page_group_unlock(head); + spin_unlock(&inode->i_lock); + + /* release ref from nfs_page_find_head_request_locked */ + nfs_release_request(head); + + if (!nonblock) + ret = nfs_wait_on_request(req); + else + ret = -EAGAIN; + nfs_release_request(req); + + return ret; +} + +/* + * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests + * + * @destroy_list - request list (using wb_this_page) terminated by @old_head + * @old_head - the old head of the list + * + * All subrequests must be locked and removed from all lists, so at this point + * they are only "active" in this function, and possibly in nfs_wait_on_request + * with a reference held by some other context. + */ +static void +nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, + struct nfs_page *old_head) +{ + while (destroy_list) { + struct nfs_page *subreq = destroy_list; + + destroy_list = (subreq->wb_this_page == old_head) ? + NULL : subreq->wb_this_page; + + WARN_ON_ONCE(old_head != subreq->wb_head); + + /* make sure old group is not used */ + subreq->wb_head = subreq; + subreq->wb_this_page = subreq; + + nfs_clear_request_commit(subreq); + + /* subreq is now totally disconnected from page group or any + * write / commit lists. last chance to wake any waiters */ + nfs_unlock_request(subreq); + + if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { + /* release ref on old head request */ + nfs_release_request(old_head); + + nfs_page_group_clear_bits(subreq); + + /* release the PG_INODE_REF reference */ + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) + nfs_release_request(subreq); + else + WARN_ON_ONCE(1); + } else { + WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); + /* zombie requests have already released the last + * reference and were waiting on the rest of the + * group to complete. Since it's no longer part of a + * group, simply free the request */ + nfs_page_group_clear_bits(subreq); + nfs_free_request(subreq); + } + } +} + +/* + * nfs_lock_and_join_requests - join all subreqs to the head req and return + * a locked reference, cancelling any pending + * operations for this page. + * + * @page - the page used to lookup the "page group" of nfs_page structures + * @nonblock - if true, don't block waiting for request locks + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group. All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * error was encountered. + */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page, bool nonblock) { struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req; + struct nfs_page *head, *subreq; + struct nfs_page *destroy_list = NULL; + unsigned int total_bytes; int ret; +try_again: + total_bytes = 0; + + WARN_ON_ONCE(destroy_list); + spin_lock(&inode->i_lock); - for (;;) { - req = nfs_page_find_request_locked(NFS_I(inode), page); - if (req == NULL) - break; - if (nfs_lock_request(req)) - break; - /* Note: If we hold the page lock, as is the case in nfs_writepage, - * then the call to nfs_lock_request() will always - * succeed provided that someone hasn't already marked the - * request as dirty (in which case we don't care). - */ + + /* + * A reference is taken only on the head request which acts as a + * reference to the whole page group - the group will not be destroyed + * until the head reference is released. + */ + head = nfs_page_find_head_request_locked(NFS_I(inode), page); + + if (!head) { spin_unlock(&inode->i_lock); - if (!nonblock) - ret = nfs_wait_on_request(req); - else - ret = -EAGAIN; - nfs_release_request(req); - if (ret != 0) + return NULL; + } + + /* lock each request in the page group */ + nfs_page_group_lock(head); + subreq = head; + do { + /* + * Subrequests are always contiguous, non overlapping + * and in order. If not, it's a programming error. + */ + WARN_ON_ONCE(subreq->wb_offset != + (head->wb_offset + total_bytes)); + + /* keep track of how many bytes this group covers */ + total_bytes += subreq->wb_bytes; + + if (!nfs_lock_request(subreq)) { + /* releases page group bit lock and + * inode spin lock and all references */ + ret = nfs_unroll_locks_and_wait(inode, head, + subreq, nonblock); + + if (ret == 0) + goto try_again; + return ERR_PTR(ret); - spin_lock(&inode->i_lock); + } + + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* Now that all requests are locked, make sure they aren't on any list. + * Commit list removal accounting is done after locks are dropped */ + subreq = head; + do { + nfs_list_remove_request(subreq); + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* unlink subrequests from head, destroy them later */ + if (head->wb_this_page != head) { + /* destroy list will be terminated by head */ + destroy_list = head->wb_this_page; + head->wb_this_page = head; + + /* change head request to cover whole range that + * the former page group covered */ + head->wb_bytes = total_bytes; } + + /* + * prepare head request to be added to new pgio descriptor + */ + nfs_page_group_clear_bits(head); + + /* + * some part of the group was still on the inode list - otherwise + * the group wouldn't be involved in async write. + * grab a reference for the head request, iff it needs one. + */ + if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) + kref_get(&head->wb_kref); + + nfs_page_group_unlock(head); + + /* drop lock to clear_request_commit the head req and clean up + * requests on destroy list */ spin_unlock(&inode->i_lock); - return req; + + nfs_destroy_unlinked_subrequests(destroy_list, head); + + /* clean up commit list state */ + nfs_clear_request_commit(head); + + /* still holds ref on head from nfs_page_find_head_request_locked + * and still has lock on head from lock loop */ + return head; } /* @@ -310,7 +542,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, struct nfs_page *req; int ret = 0; - req = nfs_find_and_lock_request(page, nonblock); + req = nfs_lock_and_join_requests(page, nonblock); if (!req) goto out; ret = PTR_ERR(req); @@ -354,10 +586,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc struct nfs_pageio_descriptor pgio; int err; - NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio, - page->mapping->host, - wb_priority(wbc), - &nfs_async_write_completion_ops); + nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc), + false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) @@ -400,12 +630,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - NFS_PROTO(inode)->write_pageio_init(&pgio, inode, wb_priority(wbc), &nfs_async_write_completion_ops); + nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, + &nfs_async_write_completion_ops); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_FLUSHING); if (err < 0) @@ -425,6 +656,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(inode); + WARN_ON_ONCE(req->wb_this_page != req); + /* Lock the request! */ nfs_lock_request(req); @@ -441,6 +674,9 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) set_page_private(req->wb_page, (unsigned long)req); } nfsi->npages++; + /* this a head request for a page group - mark it as having an + * extra reference so sub groups can follow suit */ + WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); kref_get(&req->wb_kref); spin_unlock(&inode->i_lock); } @@ -452,16 +688,23 @@ static void nfs_inode_remove_request(struct nfs_page *req) { struct inode *inode = req->wb_context->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_page *head; - spin_lock(&inode->i_lock); - if (likely(!PageSwapCache(req->wb_page))) { - set_page_private(req->wb_page, 0); - ClearPagePrivate(req->wb_page); - clear_bit(PG_MAPPED, &req->wb_flags); + if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { + head = req->wb_head; + + spin_lock(&inode->i_lock); + if (likely(!PageSwapCache(head->wb_page))) { + set_page_private(head->wb_page, 0); + ClearPagePrivate(head->wb_page); + clear_bit(PG_MAPPED, &head->wb_flags); + } + nfsi->npages--; + spin_unlock(&inode->i_lock); } - nfsi->npages--; - spin_unlock(&inode->i_lock); - nfs_release_request(req); + + if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) + nfs_release_request(req); } static void @@ -583,7 +826,7 @@ nfs_clear_request_commit(struct nfs_page *req) } static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct nfs_pgio_data *data) { if (data->verf.committed == NFS_DATA_SYNC) return data->header->lseg == NULL; @@ -614,7 +857,7 @@ nfs_clear_request_commit(struct nfs_page *req) } static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct nfs_pgio_data *data) { return 0; } @@ -645,7 +888,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) goto next; } if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { - memcpy(&req->wb_verf, &hdr->verf->verifier, sizeof(req->wb_verf)); + memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo); goto next; } @@ -653,7 +896,7 @@ remove_req: nfs_inode_remove_request(req); next: nfs_unlock_request(req); - nfs_end_page_writeback(req->wb_page); + nfs_end_page_writeback(req); nfs_release_request(req); } out: @@ -661,7 +904,7 @@ out: } #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) -static unsigned long +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { return cinfo->mds->ncommit; @@ -718,7 +961,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, } #else -static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { return 0; } @@ -754,10 +997,14 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); for (;;) { - req = nfs_page_find_request_locked(NFS_I(inode), page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); if (req == NULL) goto out_unlock; + /* should be handled by nfs_flush_incompatible */ + WARN_ON_ONCE(req->wb_head != req); + WARN_ON_ONCE(req->wb_this_page != req); + rqend = req->wb_offset + req->wb_bytes; /* * Tell the caller to flush out the request if @@ -819,7 +1066,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, req = nfs_try_to_update_request(inode, page, offset, bytes); if (req != NULL) goto out; - req = nfs_create_request(ctx, inode, page, offset, bytes); + req = nfs_create_request(ctx, page, NULL, offset, bytes); if (IS_ERR(req)) goto out; nfs_inode_add_request(inode, req); @@ -837,7 +1084,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, return PTR_ERR(req); /* Update file length */ nfs_grow_file(page, offset, count); - nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_uptodate(req); nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); return 0; @@ -858,11 +1105,13 @@ int nfs_flush_incompatible(struct file *file, struct page *page) * dropped page. */ do { - req = nfs_page_find_request(page); + req = nfs_page_find_head_request(page); if (req == NULL) return 0; l_ctx = req->wb_lock_context; do_flush = req->wb_page != page || req->wb_context != ctx; + /* for now, flush if more than 1 request in page_group */ + do_flush |= req->wb_this_page != req; if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { do_flush |= l_ctx->lockowner.l_owner != current->files || l_ctx->lockowner.l_pid != current->tgid; @@ -909,11 +1158,18 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) */ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) { + struct nfs_inode *nfsi = NFS_I(inode); + if (nfs_have_delegated_attributes(inode)) goto out; - if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + return false; + smp_rmb(); + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) return false; out: + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return false; return PageUptodate(page) != 0; } @@ -922,19 +1178,20 @@ out: * extend the write to cover the entire page in order to avoid fragmentation * inefficiencies. * - * If the file is opened for synchronous writes or if we have a write delegation - * from the server then we can just skip the rest of the checks. + * If the file is opened for synchronous writes then we can just skip the rest + * of the checks. */ static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) { if (file->f_flags & O_DSYNC) return 0; + if (!nfs_write_pageuptodate(page, inode)) + return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; - if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL || - (inode->i_flock->fl_start == 0 && + if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 && inode->i_flock->fl_end == OFFSET_MAX && - inode->i_flock->fl_type != F_RDLCK))) + inode->i_flock->fl_type != F_RDLCK)) return 1; return 0; } @@ -984,126 +1241,17 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -int nfs_initiate_write(struct rpc_clnt *clnt, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - int how, int flags) +static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg, + struct rpc_task_setup *task_setup_data, int how) { struct inode *inode = data->header->inode; int priority = flush_task_priority(how); - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = data->header->cred, - }; - struct rpc_task_setup task_setup_data = { - .rpc_client = clnt, - .task = &data->task, - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | flags, - .priority = priority, - }; - int ret = 0; - - /* Set up the initial task struct. */ - NFS_PROTO(inode)->write_setup(data, &msg); - dprintk("NFS: %5u initiated write call " - "(req %s/%lld, %u bytes @ offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - data->args.count, - (unsigned long long)data->args.offset); + task_setup_data->priority = priority; + NFS_PROTO(inode)->write_setup(data, msg); nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, - &task_setup_data.rpc_client, &msg, data); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - ret = PTR_ERR(task); - goto out; - } - if (how & FLUSH_SYNC) { - ret = rpc_wait_for_completion_task(task); - if (ret == 0) - ret = task->tk_status; - } - rpc_put_task(task); -out: - return ret; -} -EXPORT_SYMBOL_GPL(nfs_initiate_write); - -/* - * Set up the argument/result storage required for the RPC call. - */ -static void nfs_write_rpcsetup(struct nfs_write_data *data, - unsigned int count, unsigned int offset, - int how, struct nfs_commit_info *cinfo) -{ - struct nfs_page *req = data->header->req; - - /* Set up the RPC argument and reply structs - * NB: take care not to mess about with data->commit et al. */ - - data->args.fh = NFS_FH(data->header->inode); - data->args.offset = req_offset(req) + offset; - /* pnfs_set_layoutcommit needs this */ - data->mds_offset = data->args.offset; - data->args.pgbase = req->wb_pgbase + offset; - data->args.pages = data->pages.pagevec; - data->args.count = count; - data->args.context = get_nfs_open_context(req->wb_context); - data->args.lock_context = req->wb_lock_context; - data->args.stable = NFS_UNSTABLE; - switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { - case 0: - break; - case FLUSH_COND_STABLE: - if (nfs_reqs_to_commit(cinfo)) - break; - default: - data->args.stable = NFS_FILE_SYNC; - } - - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.verf = &data->verf; - nfs_fattr_init(&data->fattr); -} - -static int nfs_do_write(struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - int how) -{ - struct inode *inode = data->header->inode; - - return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0); -} - -static int nfs_do_multiple_writes(struct list_head *head, - const struct rpc_call_ops *call_ops, - int how) -{ - struct nfs_write_data *data; - int ret = 0; - - while (!list_empty(head)) { - int ret2; - - data = list_first_entry(head, struct nfs_write_data, list); - list_del_init(&data->list); - - ret2 = nfs_do_write(data, call_ops, how); - if (ret == 0) - ret = ret2; - } - return ret; + &task_setup_data->rpc_client, msg, data); } /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -1114,7 +1262,7 @@ static void nfs_redirty_request(struct nfs_page *req) { nfs_mark_request_dirty(req); nfs_unlock_request(req); - nfs_end_page_writeback(req->wb_page); + nfs_end_page_writeback(req); nfs_release_request(req); } @@ -1134,173 +1282,30 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { .completion = nfs_write_completion, }; -static void nfs_flush_error(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - set_bit(NFS_IOHDR_REDO, &hdr->flags); - while (!list_empty(&hdr->rpc_list)) { - struct nfs_write_data *data = list_first_entry(&hdr->rpc_list, - struct nfs_write_data, list); - list_del(&data->list); - nfs_writedata_release(data); - } - desc->pg_completion_ops->error_cleanup(&desc->pg_list); -} - -/* - * Generate multiple small requests to write out a single - * contiguous dirty area on one page. - */ -static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req = hdr->req; - struct page *page = req->wb_page; - struct nfs_write_data *data; - size_t wsize = desc->pg_bsize, nbytes; - unsigned int offset; - int requests = 0; - struct nfs_commit_info cinfo; - - nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); - - if ((desc->pg_ioflags & FLUSH_COND_STABLE) && - (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) || - desc->pg_count > wsize)) - desc->pg_ioflags &= ~FLUSH_COND_STABLE; - - - offset = 0; - nbytes = desc->pg_count; - do { - size_t len = min(nbytes, wsize); - - data = nfs_writedata_alloc(hdr, 1); - if (!data) { - nfs_flush_error(desc, hdr); - return -ENOMEM; - } - data->pages.pagevec[0] = page; - nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo); - list_add(&data->list, &hdr->rpc_list); - requests++; - nbytes -= len; - offset += len; - } while (nbytes != 0); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - desc->pg_rpc_callops = &nfs_write_common_ops; - return 0; -} - -/* - * Create an RPC task for the given write request and kick it. - * The page must have been locked by the caller. - * - * It may happen that the page we're passed is not marked dirty. - * This is the case if nfs_updatepage detects a conflicting request - * that has been written but not committed. - */ -static int nfs_flush_one(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req; - struct page **pages; - struct nfs_write_data *data; - struct list_head *head = &desc->pg_list; - struct nfs_commit_info cinfo; - - data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base, - desc->pg_count)); - if (!data) { - nfs_flush_error(desc, hdr); - return -ENOMEM; - } - - nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); - pages = data->pages.pagevec; - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - *pages++ = req->wb_page; - } - - if ((desc->pg_ioflags & FLUSH_COND_STABLE) && - (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) - desc->pg_ioflags &= ~FLUSH_COND_STABLE; - - /* Set up the argument struct */ - nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); - list_add(&data->list, &hdr->rpc_list); - desc->pg_rpc_callops = &nfs_write_common_ops; - return 0; -} - -int nfs_generic_flush(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - if (desc->pg_bsize < PAGE_CACHE_SIZE) - return nfs_flush_multi(desc, hdr); - return nfs_flush_one(desc, hdr); -} -EXPORT_SYMBOL_GPL(nfs_generic_flush); - -static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) -{ - struct nfs_write_header *whdr; - struct nfs_pgio_header *hdr; - int ret; - - whdr = nfs_writehdr_alloc(); - if (!whdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); - return -ENOMEM; - } - hdr = &whdr->header; - nfs_pgheader_init(desc, hdr, nfs_writehdr_free); - atomic_inc(&hdr->refcnt); - ret = nfs_generic_flush(desc, hdr); - if (ret == 0) - ret = nfs_do_multiple_writes(&hdr->rpc_list, - desc->pg_rpc_callops, - desc->pg_ioflags); - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - return ret; -} - -static const struct nfs_pageio_ops nfs_pageio_write_ops = { - .pg_test = nfs_generic_pg_test, - .pg_doio = nfs_generic_pg_writepages, -}; - void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags, + struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops) { - nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops, - NFS_SERVER(inode)->wsize, ioflags); + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_write_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, + server->wsize, ioflags); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) { - pgio->pg_ops = &nfs_pageio_write_ops; + pgio->pg_ops = &nfs_pgio_rw_ops; pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; } EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); -void nfs_write_prepare(struct rpc_task *task, void *calldata) -{ - struct nfs_write_data *data = calldata; - int err; - err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); - if (err) - rpc_exit(task, err); -} - void nfs_commit_prepare(struct rpc_task *task, void *calldata) { struct nfs_commit_data *data = calldata; @@ -1308,23 +1313,8 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -/* - * Handle a write reply that flushes a whole page. - * - * FIXME: There is an inherent race with invalidate_inode_pages and - * writebacks since the page->count is kept > 1 for as long - * as the page has a write request pending. - */ -static void nfs_writeback_done_common(struct rpc_task *task, void *calldata) -{ - struct nfs_write_data *data = calldata; - - nfs_writeback_done(task, data); -} - -static void nfs_writeback_release_common(void *calldata) +static void nfs_writeback_release_common(struct nfs_pgio_data *data) { - struct nfs_write_data *data = calldata; struct nfs_pgio_header *hdr = data->header; int status = data->task.tk_status; @@ -1333,34 +1323,46 @@ static void nfs_writeback_release_common(void *calldata) if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) ; /* Do nothing */ else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) - memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf)); - else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf))) + memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf)); + else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf))) set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); spin_unlock(&hdr->lock); } - nfs_writedata_release(data); } -static const struct rpc_call_ops nfs_write_common_ops = { - .rpc_call_prepare = nfs_write_prepare, - .rpc_call_done = nfs_writeback_done_common, - .rpc_release = nfs_writeback_release_common, -}; +/* + * Special version of should_remove_suid() that ignores capabilities. + */ +static int nfs_should_remove_suid(const struct inode *inode) +{ + umode_t mode = inode->i_mode; + int kill = 0; + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && S_ISREG(mode))) + return kill; + + return 0; +} /* * This function is called when the WRITE call is complete. */ -void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, + struct inode *inode) { - struct nfs_writeargs *argp = &data->args; - struct nfs_writeres *resp = &data->res; - struct inode *inode = data->header->inode; int status; - dprintk("NFS: %5u nfs_writeback_done (status %d)\n", - task->tk_pid, task->tk_status); - /* * ->write_done will attempt to use post-op attributes to detect * conflicting writes by other clients. A strict interpretation @@ -1370,11 +1372,11 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ status = NFS_PROTO(inode)->write_done(task, data); if (status != 0) - return; - nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count); + return status; + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count); #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) - if (resp->verf->committed < argp->stable && task->tk_status >= 0) { + if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) { /* We tried a write call, but the server did not * commit data to stable storage even though we * requested it. @@ -1390,18 +1392,31 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", NFS_SERVER(inode)->nfs_client->cl_hostname, - resp->verf->committed, argp->stable); + data->res.verf->committed, data->args.stable); complain = jiffies + 300 * HZ; } } #endif - if (task->tk_status < 0) - nfs_set_pgio_error(data->header, task->tk_status, argp->offset); - else if (resp->count < argp->count) { + + /* Deal with the suid/sgid bit corner case */ + if (nfs_should_remove_suid(inode)) + nfs_mark_for_revalidate(inode); + return 0; +} + +/* + * This function is called when the WRITE call is complete. + */ +static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data) +{ + struct nfs_pgio_args *argp = &data->args; + struct nfs_pgio_res *resp = &data->res; + + if (resp->count < argp->count) { static unsigned long complain; /* This a short write! */ - nfs_inc_stats(inode, NFSIOS_SHORTWRITE); + nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE); /* Has the server at least made some progress? */ if (resp->count == 0) { @@ -1452,7 +1467,7 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) static void nfs_commit_clear_lock(struct nfs_inode *nfsi) { clear_bit(NFS_INO_COMMIT, &nfsi->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); } @@ -1606,9 +1621,9 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_list_remove_request(req); nfs_clear_page_commit(req->wb_page); - dprintk("NFS: commit (%s/%lld %d@%lld)", + dprintk("NFS: commit (%s/%llu %d@%lld)", req->wb_context->dentry->d_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); if (status < 0) { @@ -1782,27 +1797,28 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) struct nfs_page *req; int ret = 0; - for (;;) { - wait_on_page_writeback(page); - req = nfs_page_find_request(page); - if (req == NULL) - break; - if (nfs_lock_request(req)) { - nfs_clear_request_commit(req); - nfs_inode_remove_request(req); - /* - * In case nfs_inode_remove_request has marked the - * page as being dirty - */ - cancel_dirty_page(page, PAGE_CACHE_SIZE); - nfs_unlock_and_release_request(req); - break; - } - ret = nfs_wait_on_request(req); - nfs_release_request(req); - if (ret < 0) - break; + wait_on_page_writeback(page); + + /* blocking call to cancel all requests and join to a single (head) + * request */ + req = nfs_lock_and_join_requests(page, false); + + if (IS_ERR(req)) { + ret = PTR_ERR(req); + } else if (req) { + /* all requests from this page have been cancelled by + * nfs_lock_and_join_requests, so just remove the head + * request from the inode / page_private pointer and + * release it */ + nfs_inode_remove_request(req); + /* + * In case nfs_inode_remove_request has marked the + * page as being dirty + */ + cancel_dirty_page(page, PAGE_CACHE_SIZE); + nfs_unlock_and_release_request(req); } + return ret; } @@ -1868,7 +1884,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", - sizeof(struct nfs_write_header), + sizeof(struct nfs_rw_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_wdata_cachep == NULL) @@ -1930,3 +1946,12 @@ void nfs_destroy_writepagecache(void) kmem_cache_destroy(nfs_wdata_cachep); } +static const struct nfs_rw_ops nfs_rw_write_ops = { + .rw_mode = FMODE_WRITE, + .rw_alloc_header = nfs_writehdr_alloc, + .rw_free_header = nfs_writehdr_free, + .rw_release = nfs_writeback_release_common, + .rw_done = nfs_writeback_done, + .rw_result = nfs_writeback_result, + .rw_initiate = nfs_initiate_write, +}; diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index dc8f1ef665c..f994e750e0d 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -95,7 +95,7 @@ config NFSD_V4_SECURITY_LABEL Smack policies on NFSv4 files, say N. WARNING: there is still a chance of backwards-incompatible protocol changes. - For now we recommend "Y" only for developers and testers." + For now we recommend "Y" only for developers and testers. config NFSD_FAULT_INJECTION bool "NFS server manual fault injection" diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 8b186a4955c..a986ceb6fd0 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -35,23 +35,25 @@ #ifndef LINUX_NFS4_ACL_H #define LINUX_NFS4_ACL_H -#include <linux/posix_acl.h> +struct nfs4_acl; +struct svc_fh; +struct svc_rqst; -/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to - * fit in a page: */ -#define NFS4_ACL_MAX 170 +/* + * Maximum ACL we'll accept from a client; chosen (somewhat + * arbitrarily) so that kmalloc'ing the ACL shouldn't require a + * high-order allocation. This allows 204 ACEs on x86_64: + */ +#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ + / sizeof(struct nfs4_ace)) struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); -int nfs4_acl_write_who(int who, char *p); - -#define NFS4_ACL_TYPE_DEFAULT 0x01 -#define NFS4_ACL_DIR 0x02 -#define NFS4_ACL_OWNER 0x04 +__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); -struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, - struct posix_acl *, unsigned int flags); -int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, - struct posix_acl **, unsigned int flags); +int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl); +__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl); #endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 06cddd57226..72f44823adb 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -1,7 +1,6 @@ /* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/sched.h> -#include <linux/user_namespace.h> #include "nfsd.h" #include "auth.h" @@ -25,7 +24,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) struct cred *new; int i; int flags = nfsexp_flags(rqstp, exp); - int ret; validate_process_creds(); @@ -71,10 +69,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) if (gid_eq(new->fsgid, INVALID_GID)) new->fsgid = exp->ex_anon_gid; - ret = set_groups(new, gi); + set_groups(new, gi); put_group_info(gi); - if (ret < 0) - goto error; if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) new->cap_effective = cap_drop_nfsd_set(new->cap_effective); @@ -88,9 +84,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) return 0; oom: - ret = -ENOMEM; -error: abort_creds(new); - return ret; + return -ENOMEM; } diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index d5c5b3e0026..b582f9ab6b2 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -84,12 +84,4 @@ int nfsd_cache_lookup(struct svc_rqst *); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); int nfsd_reply_cache_stats_open(struct inode *, struct file *); -#ifdef CONFIG_NFSD_V4 -void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); -#else /* CONFIG_NFSD_V4 */ -static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) -{ -} -#endif /* CONFIG_NFSD_V4 */ - #endif /* NFSCACHE_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 5f38ea36e26..13b85f94d9e 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -17,17 +17,12 @@ #include <linux/exportfs.h> #include <linux/sunrpc/svc_xprt.h> -#include <net/ipv6.h> - #include "nfsd.h" #include "nfsfh.h" #include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT -typedef struct auth_domain svc_client; -typedef struct svc_export svc_export; - /* * We have two caches. * One maps client+vfsmnt+dentry to export options - the export map @@ -73,7 +68,7 @@ static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) { - /* client fsidtype fsid [path] */ + /* client fsidtype fsid expiry [path] */ char *buf; int len; struct auth_domain *dom = NULL; @@ -295,13 +290,19 @@ svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) { + struct nfsd4_fs_location *locations = fsloc->locations; int i; + if (!locations) + return; + for (i = 0; i < fsloc->locations_count; i++) { - kfree(fsloc->locations[i].path); - kfree(fsloc->locations[i].hosts); + kfree(locations[i].path); + kfree(locations[i].hosts); } - kfree(fsloc->locations); + + kfree(locations); + fsloc->locations = NULL; } static void svc_export_put(struct kref *ref) @@ -388,6 +389,10 @@ fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) int len; int migrated, i, err; + /* more than one fsloc */ + if (fsloc->locations) + return -EINVAL; + /* listsize */ err = get_uint(mesg, &fsloc->locations_count); if (err) @@ -437,13 +442,18 @@ out_free_all: static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { - int listsize, err; struct exp_flavor_info *f; + u32 listsize; + int err; - err = get_int(mesg, &listsize); + /* more than one secinfo */ + if (exp->ex_nflavors) + return -EINVAL; + + err = get_uint(mesg, &listsize); if (err) return err; - if (listsize < 0 || listsize > MAX_SECINFO_LIST) + if (listsize > MAX_SECINFO_LIST) return -EINVAL; for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { @@ -474,6 +484,27 @@ static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif +static inline int +uuid_parse(char **mesg, char *buf, unsigned char **puuid) +{ + int len; + + /* more than one uuid */ + if (*puuid) + return -EINVAL; + + /* expect a 16 byte uuid encoded as \xXXXX... */ + len = qword_get(mesg, buf, PAGE_SIZE); + if (len != EX_UUID_LEN) + return -EINVAL; + + *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL); + if (*puuid == NULL) + return -ENOMEM; + + return 0; +} + static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ @@ -536,16 +567,12 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (err) goto out3; exp.ex_anon_uid= make_kuid(&init_user_ns, an_int); - if (!uid_valid(exp.ex_anon_uid)) - goto out3; /* anon gid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_anon_gid= make_kgid(&init_user_ns, an_int); - if (!gid_valid(exp.ex_anon_gid)) - goto out3; /* fsid */ err = get_int(&mesg, &an_int); @@ -556,18 +583,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); - else if (strcmp(buf, "uuid") == 0) { - /* expect a 16 byte uuid encoded as \xXXXX... */ - len = qword_get(&mesg, buf, PAGE_SIZE); - if (len != 16) - err = -EINVAL; - else { - exp.ex_uuid = - kmemdup(buf, 16, GFP_KERNEL); - if (exp.ex_uuid == NULL) - err = -ENOMEM; - } - } else if (strcmp(buf, "secinfo") == 0) + else if (strcmp(buf, "uuid") == 0) + err = uuid_parse(&mesg, buf, &exp.ex_uuid); + else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything @@ -583,6 +601,26 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_uuid); if (err) goto out4; + /* + * No point caching this if it would immediately expire. + * Also, this protects exportfs's dummy export from the + * anon_uid/anon_gid checks: + */ + if (exp.h.expiry_time < seconds_since_boot()) + goto out4; + /* + * For some reason exportfs has been passing down an + * invalid (-1) uid & gid on the "dummy" export which it + * uses to test export support. To make sure exportfs + * sees errors from check_export we therefore need to + * delay these checks till after check_export: + */ + err = -EINVAL; + if (!uid_valid(exp.ex_anon_uid)) + goto out4; + if (!gid_valid(exp.ex_anon_gid)) + goto out4; + err = 0; } expp = svc_export_lookup(&exp); @@ -633,7 +671,7 @@ static int svc_export_show(struct seq_file *m, if (exp->ex_uuid) { int i; seq_puts(m, ",uuid="); - for (i=0; i<16; i++) { + for (i = 0; i < EX_UUID_LEN; i++) { if ((i&3) == 0 && i) seq_putc(m, ':'); seq_printf(m, "%02x", exp->ex_uuid[i]); @@ -755,7 +793,7 @@ svc_export_update(struct svc_export *new, struct svc_export *old) static struct svc_expkey * -exp_find_key(struct cache_detail *cd, svc_client *clp, int fsid_type, +exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_expkey key, *ek; @@ -777,9 +815,9 @@ exp_find_key(struct cache_detail *cd, svc_client *clp, int fsid_type, return ek; } - -static svc_export *exp_get_by_name(struct cache_detail *cd, svc_client *clp, - const struct path *path, struct cache_req *reqp) +static struct svc_export * +exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, + const struct path *path, struct cache_req *reqp) { struct svc_export *exp, key; int err; @@ -803,11 +841,11 @@ static svc_export *exp_get_by_name(struct cache_detail *cd, svc_client *clp, /* * Find the export entry for a given dentry. */ -static struct svc_export *exp_parent(struct cache_detail *cd, svc_client *clp, - struct path *path) +static struct svc_export * +exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path) { struct dentry *saved = dget(path->dentry); - svc_export *exp = exp_get_by_name(cd, clp, path, NULL); + struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { struct dentry *parent = dget_parent(path->dentry); @@ -828,7 +866,7 @@ static struct svc_export *exp_parent(struct cache_detail *cd, svc_client *clp, * since its harder to fool a kernel module than a user space program. */ int -exp_rootfh(struct net *net, svc_client *clp, char *name, +exp_rootfh(struct net *net, struct auth_domain *clp, char *name, struct knfsd_fh *f, int maxsize) { struct svc_export *exp; diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h new file mode 100644 index 00000000000..cfeea85c5be --- /dev/null +++ b/fs/nfsd/export.h @@ -0,0 +1,110 @@ +/* + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + */ +#ifndef NFSD_EXPORT_H +#define NFSD_EXPORT_H + +#include <linux/sunrpc/cache.h> +#include <uapi/linux/nfsd/export.h> + +struct knfsd_fh; +struct svc_fh; +struct svc_rqst; + +/* + * FS Locations + */ + +#define MAX_FS_LOCATIONS 128 + +struct nfsd4_fs_location { + char *hosts; /* colon separated list of hosts */ + char *path; /* slash separated list of path components */ +}; + +struct nfsd4_fs_locations { + uint32_t locations_count; + struct nfsd4_fs_location *locations; +/* If we're not actually serving this data ourselves (only providing a + * list of replicas that do serve it) then we set "migrated": */ + int migrated; +}; + +/* + * We keep an array of pseudoflavors with the export, in order from most + * to least preferred. For the foreseeable future, we don't expect more + * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3, + * spkm3i, and spkm3p (and using all 8 at once should be rare). + */ +#define MAX_SECINFO_LIST 8 +#define EX_UUID_LEN 16 + +struct exp_flavor_info { + u32 pseudoflavor; + u32 flags; +}; + +struct svc_export { + struct cache_head h; + struct auth_domain * ex_client; + int ex_flags; + struct path ex_path; + kuid_t ex_anon_uid; + kgid_t ex_anon_gid; + int ex_fsid; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + uint32_t ex_nflavors; + struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; + struct cache_detail *cd; +}; + +/* an "export key" (expkey) maps a filehandlefragement to an + * svc_export for a given client. There can be several per export, + * for the different fsid types. + */ +struct svc_expkey { + struct cache_head h; + + struct auth_domain * ek_client; + int ek_fsidtype; + u32 ek_fsid[6]; + + struct path ek_path; +}; + +#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) +#define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE) +#define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES) + +int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp); +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp); + +/* + * Function declarations + */ +int nfsd_export_init(struct net *); +void nfsd_export_shutdown(struct net *); +void nfsd_export_flush(struct net *); +struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, + struct path *); +struct svc_export * rqst_exp_parent(struct svc_rqst *, + struct path *); +struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); +int exp_rootfh(struct net *, struct auth_domain *, + char *path, struct knfsd_fh *, int maxsize); +__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); +__be32 nfserrno(int errno); + +static inline void exp_put(struct svc_export *exp) +{ + cache_put(&exp->h, exp->cd); +} + +static inline void exp_get(struct svc_export *exp) +{ + cache_get(&exp->h); +} +struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *); + +#endif /* NFSD_EXPORT_H */ diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index d620e7f8142..2ed05c3cd43 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -97,25 +97,14 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf, { static u64 val; char read_buf[25]; - size_t size, ret; + size_t size; loff_t pos = *ppos; if (!pos) nfsd_inject_get(file_inode(file)->i_private, &val); size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); - if (pos < 0) - return -EINVAL; - if (pos >= size || !len) - return 0; - if (len > size - pos) - len = size - pos; - ret = copy_to_user(buf, read_buf + pos, len); - if (ret == len) - return -EFAULT; - len -= ret; - *ppos = pos + len; - return len; + return simple_read_from_buffer(buf, len, ppos, read_buf, size); } static ssize_t fault_inject_write(struct file *file, const char __user *buf, diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h index bf95f6b817a..a3f34900091 100644 --- a/fs/nfsd/idmap.h +++ b/fs/nfsd/idmap.h @@ -56,7 +56,7 @@ static inline void nfsd_idmap_shutdown(struct net *net) __be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *); __be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *); -int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *); -int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *); +__be32 nfsd4_encode_user(struct xdr_stream *, struct svc_rqst *, kuid_t); +__be32 nfsd4_encode_group(struct xdr_stream *, struct svc_rqst *, kgid_t); #endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 849a7c3ced2..d32b3aa6600 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -95,6 +95,7 @@ struct nfsd_net { time_t nfsd4_grace; bool nfsd_net_up; + bool lockd_up; /* * Time of server startup diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 95d76dc6c5d..12b023a7ab7 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -30,8 +30,9 @@ nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) { - svc_fh *fh; struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; __be32 nfserr = 0; dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); @@ -41,6 +42,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); + inode = fh->fh_dentry->d_inode; + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; @@ -50,21 +53,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, goto fail; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + acl = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ - - struct inode *inode = fh->fh_dentry->d_inode; acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } resp->acl_access = acl; @@ -72,17 +67,10 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - - acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + acl = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } resp->acl_default = acl; } @@ -103,31 +91,51 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, struct nfsd3_setaclargs *argp, struct nfsd_attrstat *resp) { + struct inode *inode; svc_fh *fh; __be32 nfserr = 0; + int error; dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (nfserr) + goto out; - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_ACCESS, argp->acl_access) ); - } - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_DEFAULT, argp->acl_default) ); - } - if (!nfserr) { - nfserr = fh_getattr(fh, &resp->stat); + inode = fh->fh_dentry->d_inode; + if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { + error = -EOPNOTSUPP; + goto out_errno; } + error = fh_want_write(fh); + if (error) + goto out_errno; + + error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + if (error) + goto out_drop_write; + error = inode->i_op->set_acl(inode, argp->acl_default, + ACL_TYPE_DEFAULT); + if (error) + goto out_drop_write; + + fh_drop_write(fh); + + nfserr = fh_getattr(fh, &resp->stat); + +out: /* argp->acl_{access,default} may have been allocated in nfssvc_decode_setaclargs. */ posix_acl_release(argp->acl_access); posix_acl_release(argp->acl_default); return nfserr; +out_drop_write: + fh_drop_write(fh); +out_errno: + nfserr = nfserrno(error); + goto out; } /* @@ -174,7 +182,8 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessarg static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_getaclargs *argp) { - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; argp->mask = ntohl(*p); p++; @@ -189,7 +198,8 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, unsigned int base; int n; - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; argp->mask = ntohl(*p++); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || @@ -210,7 +220,8 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *argp) { - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; return xdr_argsize_check(rqstp, p); } @@ -218,7 +229,8 @@ static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p, static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_accessargs *argp) { - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; argp->access = ntohl(*p++); diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 9cbc1a841f8..2a514e21dc7 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -29,8 +29,9 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) { - svc_fh *fh; struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); @@ -38,26 +39,20 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); + inode = fh->fh_dentry->d_inode; + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + acl = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ - - struct inode *inode = fh->fh_dentry->d_inode; acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } resp->acl_access = acl; @@ -65,17 +60,10 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - - acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + acl = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } resp->acl_default = acl; } @@ -96,21 +84,37 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, struct nfsd3_setaclargs *argp, struct nfsd3_attrstat *resp) { + struct inode *inode; svc_fh *fh; __be32 nfserr = 0; + int error; fh = fh_copy(&resp->fh, &argp->fh); nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (nfserr) + goto out; - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_ACCESS, argp->acl_access) ); - } - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_DEFAULT, argp->acl_default) ); + inode = fh->fh_dentry->d_inode; + if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { + error = -EOPNOTSUPP; + goto out_errno; } + error = fh_want_write(fh); + if (error) + goto out_errno; + + error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + if (error) + goto out_drop_write; + error = inode->i_op->set_acl(inode, argp->acl_default, + ACL_TYPE_DEFAULT); + +out_drop_write: + fh_drop_write(fh); +out_errno: + nfserr = nfserrno(error); +out: /* argp->acl_{access,default} may have been allocated in nfs3svc_decode_setaclargs. */ posix_acl_release(argp->acl_access); @@ -124,7 +128,8 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_getaclargs *args) { - if (!(p = nfs3svc_decode_fh(p, &args->fh))) + p = nfs3svc_decode_fh(p, &args->fh); + if (!p) return 0; args->mask = ntohl(*p); p++; @@ -139,7 +144,8 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, unsigned int base; int n; - if (!(p = nfs3svc_decode_fh(p, &args->fh))) + p = nfs3svc_decode_fh(p, &args->fh); + if (!p) return 0; args->mask = ntohl(*p++); if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 14d9ecb96cf..e6c01e80325 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -168,7 +168,7 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) { *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); - *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) (stat->mode & S_IALLUGO)); *p++ = htonl((u32) stat->nlink); *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); @@ -278,7 +278,8 @@ void fill_post_wcc(struct svc_fh *fhp) int nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; return xdr_argsize_check(rqstp, p); } @@ -287,7 +288,8 @@ int nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_sattrargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = decode_sattr3(p, &args->attrs); @@ -315,7 +317,8 @@ int nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_accessargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->access = ntohl(*p++); @@ -330,7 +333,8 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, int v; u32 max_blocksize = svc_max_payload(rqstp); - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); @@ -360,7 +364,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, unsigned int len, v, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); @@ -535,7 +540,8 @@ int nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_readlinkargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); @@ -558,7 +564,8 @@ int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_readdirargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->cookie); args->verf = p; p += 2; @@ -580,7 +587,8 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, int len; u32 max_blocksize = svc_max_payload(rqstp); - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->cookie); args->verf = p; p += 2; @@ -605,7 +613,8 @@ int nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_commitargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); args->count = ntohl(*p++); @@ -842,21 +851,21 @@ out: static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) { - struct svc_fh fh; + struct svc_fh *fh = &cd->scratch; __be32 err; - fh_init(&fh, NFS3_FHSIZE); - err = compose_entry_fh(cd, &fh, name, namlen); + fh_init(fh, NFS3_FHSIZE); + err = compose_entry_fh(cd, fh, name, namlen); if (err) { *p++ = 0; *p++ = 0; goto out; } - p = encode_post_op_attr(cd->rqstp, p, &fh); + p = encode_post_op_attr(cd->rqstp, p, fh); *p++ = xdr_one; /* yes, a file handle follows */ - p = encode_fh(p, &fh); + p = encode_fh(p, fh); out: - fh_put(&fh); + fh_put(fh); return p; } diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 8a50b3c1809..d714156a19f 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,9 +36,14 @@ #include <linux/slab.h> #include <linux/nfs_fs.h> -#include <linux/export.h> +#include "nfsfh.h" +#include "nfsd.h" #include "acl.h" +#include "vfs.h" +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 /* mode bit translations: */ #define NFS4_READ_MODE (NFS4_ACE_READ_DATA) @@ -130,36 +135,47 @@ static short ace2type(struct nfs4_ace *); static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int); -struct nfs4_acl * -nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, - unsigned int flags) +int +nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl) { - struct nfs4_acl *acl; + struct inode *inode = dentry->d_inode; + int error = 0; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; int size = 0; - if (pacl) { - if (posix_acl_valid(pacl) < 0) - return ERR_PTR(-EINVAL); - size += 2*pacl->a_count; + pacl = get_acl(inode, ACL_TYPE_ACCESS); + if (!pacl) { + pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + if (IS_ERR(pacl)) + return PTR_ERR(pacl); } - if (dpacl) { - if (posix_acl_valid(dpacl) < 0) - return ERR_PTR(-EINVAL); - size += 2*dpacl->a_count; + /* allocate for worst case: one (deny, allow) pair each: */ + size += 2 * pacl->a_count; + + if (S_ISDIR(inode->i_mode)) { + flags = NFS4_ACL_DIR; + dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (dpacl) + size += 2 * dpacl->a_count; } - /* Allocate for worst case: one (deny, allow) pair each: */ - acl = nfs4_acl_new(size); - if (acl == NULL) - return ERR_PTR(-ENOMEM); + *acl = nfs4_acl_new(size); + if (*acl == NULL) { + error = -ENOMEM; + goto out; + } - if (pacl) - _posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT); + _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); if (dpacl) - _posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT); + _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); - return acl; + out: + posix_acl_release(pacl); + posix_acl_release(dpacl); + return error; } struct posix_acl_summary { @@ -385,8 +401,10 @@ sort_pacl(struct posix_acl *pacl) * by uid/gid. */ int i, j; - if (pacl->a_count <= 4) - return; /* no users or groups */ + /* no users or groups */ + if (!pacl || pacl->a_count <= 4) + return; + i = 1; while (pacl->a_entries[i].e_tag == ACL_USER) i++; @@ -513,19 +531,21 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) /* * ACLs with no ACEs are treated differently in the inheritable - * and effective cases: when there are no inheritable ACEs, we - * set a zero-length default posix acl: + * and effective cases: when there are no inheritable ACEs, + * calls ->set_acl with a NULL ACL structure. */ - if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) { - pacl = posix_acl_alloc(0, GFP_KERNEL); - return pacl ? pacl : ERR_PTR(-ENOMEM); - } + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + return NULL; + /* * When there are no effective ACEs, the following will end * up setting a 3-element effective posix ACL with all * permissions zero. */ - nace = 4 + state->users->n + state->groups->n; + if (!state->users->n && !state->groups->n) + nace = 3; + else /* Note we also include a MASK ACE in this case: */ + nace = 4 + state->users->n + state->groups->n; pacl = posix_acl_alloc(nace, GFP_KERNEL); if (!pacl) return ERR_PTR(-ENOMEM); @@ -569,9 +589,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) add_to_mask(state, &state->groups->aces[i].perms); } - pace++; - pace->e_tag = ACL_MASK; - low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); + if (state->users->n || state->groups->n) { + pace++; + pace->e_tag = ACL_MASK; + low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); + } pace++; pace->e_tag = ACL_OTHER; @@ -719,8 +741,9 @@ static void process_one_v4_ace(struct posix_acl_state *state, } } -int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, - struct posix_acl **dpacl, unsigned int flags) +static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, + struct posix_acl **pacl, struct posix_acl **dpacl, + unsigned int flags) { struct posix_acl_state effective_acl_state, default_acl_state; struct nfs4_ace *ace; @@ -780,6 +803,57 @@ out_estate: return ret; } +__be32 +nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl) +{ + __be32 error; + int host_error; + struct dentry *dentry; + struct inode *inode; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + + /* Get inode */ + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); + if (error) + return error; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + if (!inode->i_op->set_acl || !IS_POSIXACL(inode)) + return nfserr_attrnotsupp; + + if (S_ISDIR(inode->i_mode)) + flags = NFS4_ACL_DIR; + + host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); + if (host_error == -EINVAL) + return nfserr_attrnotsupp; + if (host_error < 0) + goto out_nfserr; + + host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS); + if (host_error < 0) + goto out_release; + + if (S_ISDIR(inode->i_mode)) { + host_error = inode->i_op->set_acl(inode, dpacl, + ACL_TYPE_DEFAULT); + } + +out_release: + posix_acl_release(pacl); + posix_acl_release(dpacl); +out_nfserr: + if (host_error == -EOPNOTSUPP) + return nfserr_attrnotsupp; + else + return nfserrno(host_error); +} + + static short ace2type(struct nfs4_ace *ace) { @@ -798,9 +872,6 @@ ace2type(struct nfs4_ace *ace) return -1; } -EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4); -EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix); - struct nfs4_acl * nfs4_acl_new(int n) { @@ -848,21 +919,21 @@ nfs4_acl_get_whotype(char *p, u32 len) return NFS4_ACL_WHO_NAMED; } -int -nfs4_acl_write_who(int who, char *p) +__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who) { + __be32 *p; int i; for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { - if (s2t_map[i].type == who) { - memcpy(p, s2t_map[i].string, s2t_map[i].stringlen); - return s2t_map[i].stringlen; - } + if (s2t_map[i].type != who) + continue; + p = xdr_reserve_space(xdr, s2t_map[i].stringlen + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, s2t_map[i].string, + s2t_map[i].stringlen); + return 0; } - BUG(); + WARN_ON_ONCE(1); return -1; } - -EXPORT_SYMBOL(nfs4_acl_new); -EXPORT_SYMBOL(nfs4_acl_get_whotype); -EXPORT_SYMBOL(nfs4_acl_write_who); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7f05cd140de..2c73cae9899 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -32,6 +32,7 @@ */ #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/xprt.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/slab.h> #include "nfsd.h" @@ -635,11 +636,29 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc } } +static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) +{ + struct rpc_xprt *xprt; + + if (args->protocol != XPRT_TRANSPORT_BC_TCP) + return rpc_create(args); + + xprt = args->bc_xprt->xpt_bc_xprt; + if (xprt) { + xprt_get(xprt); + return rpc_create_xprt(args, xprt); + } + + return rpc_create(args); +} + static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { + int maxtime = max_cb_time(clp->net); struct rpc_timeout timeparms = { - .to_initval = max_cb_time(clp->net), + .to_initval = maxtime, .to_retries = 0, + .to_maxval = maxtime, }; struct rpc_create_args args = { .net = clp->net, @@ -674,7 +693,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = ses->se_cb_sec.flavor; } /* Create RPC client */ - client = rpc_create(&args); + client = create_backchannel_client(&args); if (IS_ERR(client)) { dprintk("NFSD: couldn't create callback client: %ld\n", PTR_ERR(client)); diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 4832fd819f8..a0ab0a847d6 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -551,27 +551,45 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen return 0; } -static int -idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) +static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id) +{ + char buf[11]; + int len; + __be32 *p; + + len = sprintf(buf, "%u", id); + p = xdr_reserve_space(xdr, len + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, buf, len); + return 0; +} + +static __be32 idmap_id_to_name(struct xdr_stream *xdr, + struct svc_rqst *rqstp, int type, u32 id) { struct ent *item, key = { .id = id, .type = type, }; + __be32 *p; int ret; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) - return sprintf(name, "%u", id); + return encode_ascii_id(xdr, id); if (ret) - return ret; + return nfserrno(ret); ret = strlen(item->name); - BUG_ON(ret > IDMAP_NAMESZ); - memcpy(name, item->name, ret); + WARN_ON_ONCE(ret > IDMAP_NAMESZ); + p = xdr_reserve_space(xdr, ret + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, item->name, ret); cache_put(&item->h, nn->idtoname_cache); - return ret; + return 0; } static bool @@ -603,12 +621,12 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u return idmap_name_to_id(rqstp, type, name, namelen, id); } -static int -do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) +static __be32 encode_name_from_id(struct xdr_stream *xdr, + struct svc_rqst *rqstp, int type, u32 id) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) - return sprintf(name, "%u", id); - return idmap_id_to_name(rqstp, type, id, name); + return encode_ascii_id(xdr, id); + return idmap_id_to_name(xdr, rqstp, type, id); } __be32 @@ -637,16 +655,16 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, return status; } -int -nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name) +__be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp, + kuid_t uid) { u32 id = from_kuid(&init_user_ns, uid); - return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name); + return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id); } -int -nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name) +__be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp, + kgid_t gid) { u32 id = from_kgid(&init_user_ns, gid); - return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name); + return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 419572f33b7..8f029db5d27 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -41,6 +41,7 @@ #include "vfs.h" #include "current_stateid.h" #include "netns.h" +#include "acl.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> @@ -230,17 +231,16 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate } static __be32 -do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { struct svc_fh *current_fh = &cstate->current_fh; - struct svc_fh *resfh; int accmode; __be32 status; - resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); - if (!resfh) + *resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + if (!*resfh) return nfserr_jukebox; - fh_init(resfh, NFS4_FHSIZE); + fh_init(*resfh, NFS4_FHSIZE); open->op_truncate = 0; if (open->op_create) { @@ -265,12 +265,12 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru */ status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &open->op_iattr, - resfh, open->op_createmode, + *resfh, open->op_createmode, (u32 *)open->op_verf.data, &open->op_truncate, &open->op_created); if (!status && open->op_label.len) - nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval); + nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); /* * Following rfc 3530 14.2.16, use the returned bitmask @@ -280,31 +280,32 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); - } else { + } else + /* + * Note this may exit with the parent still locked. + * We will hold the lock until nfsd4_open's final + * lookup, to prevent renames or unlinks until we've had + * a chance to an acquire a delegation if appropriate. + */ status = nfsd_lookup(rqstp, current_fh, - open->op_fname.data, open->op_fname.len, resfh); - fh_unlock(current_fh); - } + open->op_fname.data, open->op_fname.len, *resfh); if (status) goto out; - status = nfsd_check_obj_isreg(resfh); + status = nfsd_check_obj_isreg(*resfh); if (status) goto out; if (is_create_with_attrs(open) && open->op_acl != NULL) - do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval); + do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval); - nfsd4_set_open_owner_reply_cache(cstate, open, resfh); + nfsd4_set_open_owner_reply_cache(cstate, open, *resfh); accmode = NFSD_MAY_NOP; if (open->op_created || open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) accmode |= NFSD_MAY_OWNER_OVERRIDE; - status = do_open_permission(rqstp, resfh, open, accmode); + status = do_open_permission(rqstp, *resfh, open, accmode); set_change_info(&open->op_cinfo, current_fh); - fh_dup2(current_fh, resfh); out: - fh_put(resfh); - kfree(resfh); return status; } @@ -357,6 +358,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { __be32 status; + struct svc_fh *resfh = NULL; struct nfsd4_compoundres *resp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -423,26 +425,26 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_NULL: - status = do_open_lookup(rqstp, cstate, open); + status = do_open_lookup(rqstp, cstate, open, &resfh); if (status) goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion, nn); if (status) goto out; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, cstate, open); if (status) goto out; + resfh = &cstate->current_fh; break; case NFS4_OPEN_CLAIM_DELEG_PREV_FH: case NFS4_OPEN_CLAIM_DELEGATE_PREV: - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; @@ -458,9 +460,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * successful, it (1) truncates the file if open->op_truncate was * set, (2) sets open->op_stateid, (3) sets open->op_delegation. */ - status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); + status = nfsd4_process_open2(rqstp, resfh, open); WARN_ON(status && open->op_created); out: + if (resfh && resfh != &cstate->current_fh) { + fh_dup2(&cstate->current_fh, resfh); + fh_put(resfh); + kfree(resfh); + } nfsd4_cleanup_open_state(open, status); if (open->op_openowner && !nfsd4_has_session(cstate)) cstate->replay_owner = &open->op_openowner->oo_owner; @@ -610,15 +617,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (create->cr_type) { case NF4LNK: - /* ugh! we have to null-terminate the linktext, or - * vfs_symlink() will choke. it is always safe to - * null-terminate by brute force, since at worst we - * will overwrite the first byte of the create namelen - * in the XDR buffer, which has already been extracted - * during XDR decode. - */ - create->cr_linkname[create->cr_linklen] = 0; - status = nfsd_symlink(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, create->cr_linkname, create->cr_linklen, @@ -778,7 +776,6 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!nfsd4_last_compound_op(rqstp)) rqstp->rq_splice_ok = false; - nfs4_lock_state(); /* check stateid */ if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, &read->rd_stateid, @@ -786,11 +783,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } - if (read->rd_filp) - get_file(read->rd_filp); status = nfs_ok; out: - nfs4_unlock_state(); read->rd_rqstp = rqstp; read->rd_fhp = &cstate->current_fh; return status; @@ -929,10 +923,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int err; if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { - nfs4_lock_state(); status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, &setattr->sa_stateid, WR_STATE, NULL); - nfs4_unlock_state(); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; @@ -998,17 +990,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (write->wr_offset >= OFFSET_MAX) return nfserr_inval; - nfs4_lock_state(); status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, stateid, WR_STATE, &filp); if (status) { - nfs4_unlock_state(); dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; } - if (filp) - get_file(filp); - nfs4_unlock_state(); cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; @@ -1064,13 +1051,15 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_jukebox; p = buf; - status = nfsd4_encode_fattr(&cstate->current_fh, + status = nfsd4_encode_fattr_to_buf(&p, count, &cstate->current_fh, cstate->current_fh.fh_export, - cstate->current_fh.fh_dentry, &p, - count, verify->ve_bmval, + cstate->current_fh.fh_dentry, + verify->ve_bmval, rqstp, 0); - - /* this means that nfsd4_encode_fattr() ran out of space */ + /* + * If nfsd4_encode_fattr() ran out of space, assume that's because + * the attributes are longer (hence different) than those given: + */ if (status == nfserr_resource) status = nfserr_not_same; if (status) @@ -1172,9 +1161,7 @@ struct nfsd4_operation { static struct nfsd4_operation nfsd4_ops[]; -#ifdef NFSD_DEBUG static const char *nfsd4_op_name(unsigned opnum); -#endif /* * Enforce NFSv4.1 COMPOUND ordering rules: @@ -1216,6 +1203,8 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) bool nfsd4_cache_this_op(struct nfsd4_op *op) { + if (op->opnum == OP_ILLEGAL) + return false; return OPDESC(op)->op_flags & OP_CACHEME; } @@ -1252,6 +1241,25 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp) return !(nextd->op_flags & OP_HANDLES_WRONGSEC); } +static void svcxdr_init_encode(struct svc_rqst *rqstp, + struct nfsd4_compoundres *resp) +{ + struct xdr_stream *xdr = &resp->xdr; + struct xdr_buf *buf = &rqstp->rq_res; + struct kvec *head = buf->head; + + xdr->buf = buf; + xdr->iov = head; + xdr->p = head->iov_base + head->iov_len; + xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; + /* Tail and page_len should be zero at this point: */ + buf->len = buf->head[0].iov_len; + xdr->scratch.iov_len = 0; + xdr->page_ptr = buf->pages - 1; + buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) + - rqstp->rq_auth_slack; +} + /* * COMPOUND call. */ @@ -1263,26 +1271,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, struct nfsd4_op *op; struct nfsd4_operation *opdesc; struct nfsd4_compound_state *cstate = &resp->cstate; - int slack_bytes; - u32 plen = 0; + struct svc_fh *current_fh = &cstate->current_fh; + struct svc_fh *save_fh = &cstate->save_fh; __be32 status; - resp->xbuf = &rqstp->rq_res; - resp->p = rqstp->rq_res.head[0].iov_base + - rqstp->rq_res.head[0].iov_len; - resp->tagp = resp->p; + svcxdr_init_encode(rqstp, resp); + resp->tagp = resp->xdr.p; /* reserve space for: taglen, tag, and opcnt */ - resp->p += 2 + XDR_QUADLEN(args->taglen); - resp->end = rqstp->rq_res.head[0].iov_base + PAGE_SIZE; + xdr_reserve_space(&resp->xdr, 8 + args->taglen); resp->taglen = args->taglen; resp->tag = args->tag; - resp->opcnt = 0; resp->rqstp = rqstp; - resp->cstate.minorversion = args->minorversion; - resp->cstate.replay_owner = NULL; - resp->cstate.session = NULL; - fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); - fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + cstate->minorversion = args->minorversion; + fh_init(current_fh, NFS4_FHSIZE); + fh_init(save_fh, NFS4_FHSIZE); /* * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. @@ -1320,35 +1322,34 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, goto encode_op; } - /* We must be able to encode a successful response to - * this operation, with enough room left over to encode a - * failed response to the next operation. If we don't - * have enough room, fail with ERR_RESOURCE. - */ - slack_bytes = (char *)resp->end - (char *)resp->p; - if (slack_bytes < COMPOUND_SLACK_SPACE - + COMPOUND_ERR_SLACK_SPACE) { - BUG_ON(slack_bytes < COMPOUND_ERR_SLACK_SPACE); - op->status = nfserr_resource; - goto encode_op; - } - opdesc = OPDESC(op); - if (!cstate->current_fh.fh_dentry) { + if (!current_fh->fh_dentry) { if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { op->status = nfserr_nofilehandle; goto encode_op; } - } else if (cstate->current_fh.fh_export->ex_fslocs.migrated && + } else if (current_fh->fh_export->ex_fslocs.migrated && !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { op->status = nfserr_moved; goto encode_op; } + fh_clear_wcc(current_fh); + /* If op is non-idempotent */ if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { - plen = opdesc->op_rsize_bop(rqstp, op); + /* + * Don't execute this op if we couldn't encode a + * succesful reply: + */ + u32 plen = opdesc->op_rsize_bop(rqstp, op); + /* + * Plus if there's another operation, make sure + * we'll have space to at least encode an error: + */ + if (resp->opcnt < args->opcnt) + plen += COMPOUND_ERR_SLACK_SPACE; op->status = nfsd4_check_resp_size(resp, plen); } @@ -1367,19 +1368,19 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, clear_current_stateid(cstate); if (need_wrongsec_check(rqstp)) - op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp); + op->status = check_nfsd_access(current_fh->fh_export, rqstp); } encode_op: /* Only from SEQUENCE */ - if (resp->cstate.status == nfserr_replay_cache) { + if (cstate->status == nfserr_replay_cache) { dprintk("%s NFS4.1 replay from cache\n", __func__); status = op->status; goto out; } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; - nfsd4_encode_replay(resp, op); + nfsd4_encode_replay(&resp->xdr, op); status = op->status = op->replay->rp_status; } else { nfsd4_encode_operation(resp, op); @@ -1401,10 +1402,10 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } - resp->cstate.status = status; - fh_put(&resp->cstate.current_fh); - fh_put(&resp->cstate.save_fh); - BUG_ON(resp->cstate.replay_owner); + cstate->status = status; + fh_put(current_fh); + fh_put(save_fh); + BUG_ON(cstate->replay_owner); out: /* Reset deferral mechanism for RPC deferrals */ rqstp->rq_usedeferral = 1; @@ -1418,7 +1419,8 @@ out: #define op_encode_change_info_maxsz (5) #define nfs4_fattr_bitmap_maxsz (4) -#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +/* We'll fall back on returning no lockowner if run out of space: */ +#define op_encode_lockowner_maxsz (0) #define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz) #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) @@ -1450,6 +1452,49 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); } +/* + * Note since this is an idempotent operation we won't insist on failing + * the op prematurely if the estimate is too large. We may turn off splice + * reads unnecessarily. + */ +static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + u32 *bmap = op->u.getattr.ga_bmval; + u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2]; + u32 ret = 0; + + if (bmap0 & FATTR4_WORD0_ACL) + return svc_max_payload(rqstp); + if (bmap0 & FATTR4_WORD0_FS_LOCATIONS) + return svc_max_payload(rqstp); + + if (bmap1 & FATTR4_WORD1_OWNER) { + ret += IDMAP_NAMESZ + 4; + bmap1 &= ~FATTR4_WORD1_OWNER; + } + if (bmap1 & FATTR4_WORD1_OWNER_GROUP) { + ret += IDMAP_NAMESZ + 4; + bmap1 &= ~FATTR4_WORD1_OWNER_GROUP; + } + if (bmap0 & FATTR4_WORD0_FILEHANDLE) { + ret += NFS4_FHSIZE + 4; + bmap0 &= ~FATTR4_WORD0_FILEHANDLE; + } + if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) { + ret += NFSD4_MAX_SEC_LABEL_LEN + 12; + bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL; + } + /* + * Largest of remaining attributes are 16 bytes (e.g., + * supported_attributes) + */ + ret += 16 * (hweight32(bmap0) + hweight32(bmap1) + hweight32(bmap2)); + /* bitmask, length */ + ret += 20; + return ret; +} + static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) @@ -1480,18 +1525,19 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) if (rlen > maxcount) rlen = maxcount; - return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen; + return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); } static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { + u32 maxcount = svc_max_payload(rqstp); u32 rlen = op->u.readdir.rd_maxcount; - if (rlen > PAGE_SIZE) - rlen = PAGE_SIZE; + if (rlen > maxcount) + rlen = maxcount; - return (op_encode_hdr_size + op_encode_verifier_maxsz) - * sizeof(__be32) + rlen; + return (op_encode_hdr_size + op_encode_verifier_maxsz + + XDR_QUADLEN(rlen)) * sizeof(__be32); } static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) @@ -1506,6 +1552,12 @@ static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op + op_encode_change_info_maxsz) * sizeof(__be32); } +static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return NFS4_MAX_SESSIONID_LEN + 20; +} + static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); @@ -1513,18 +1565,20 @@ static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { - return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32); + return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) * + sizeof(__be32); } static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { - return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); + return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32); } static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ - 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\ + 1 + 1 + /* eir_flags, spr_how */\ + 4 + /* spo_must_enforce & _allow with bitmap */\ 2 + /*eir_server_owner.so_minor_id */\ /* eir_server_owner.so_major_id<> */\ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ @@ -1585,6 +1639,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, .op_flags = ALLOWED_ON_ABSENT_FS, + .op_rsize_bop = nfsd4_getattr_rsize, .op_name = "OP_GETATTR", }, [OP_GETFH] = { @@ -1654,37 +1709,32 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING - | OP_CLEAR_STATEID, + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, .op_name = "OP_PUTFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTPUBFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING - | OP_CLEAR_STATEID, + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, .op_name = "OP_PUTPUBFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING - | OP_CLEAR_STATEID, + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, .op_name = "OP_PUTROOTFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_READ] = { .op_func = (nfsd4op_func)nfsd4_read, - .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READ", .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize, .op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid, }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, - .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READDIR", .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize, }, @@ -1842,14 +1892,33 @@ static struct nfsd4_operation nfsd4_ops[] = { }, }; -#ifdef NFSD_DEBUG +int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + struct nfsd4_operation *opdesc; + nfsd4op_rsize estimator; + + if (op->opnum == OP_ILLEGAL) + return op_encode_hdr_size * sizeof(__be32); + opdesc = OPDESC(op); + estimator = opdesc->op_rsize_bop; + return estimator ? estimator(rqstp, op) : PAGE_SIZE; +} + +void warn_on_nonidempotent_op(struct nfsd4_op *op) +{ + if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) { + pr_err("unable to encode reply to nonidempotent op %d (%s)\n", + op->opnum, nfsd4_op_name(op->opnum)); + WARN_ON_ONCE(1); + } +} + static const char *nfsd4_op_name(unsigned opnum) { if (opnum < ARRAY_SIZE(nfsd4_ops)) return nfsd4_ops[opnum].op_name; return "unknown_operation"; } -#endif #define nfsd4_voidres nfsd4_voidargs struct nfsd4_voidargs { int dummy; }; @@ -1881,6 +1950,7 @@ struct svc_version nfsd_version4 = { .vs_proc = nfsd_procedures4, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS4_SVC_XDRSIZE, + .vs_rpcb_optnl = 1, }; /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a601fd49f99..2204e1fe572 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -41,6 +41,7 @@ #include <linux/ratelimit.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/addr.h> +#include <linux/hash.h> #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" @@ -81,13 +82,13 @@ static DEFINE_MUTEX(client_mutex); * effort to decrease the scope of the client_mutex, this spinlock may * eventually cover more: */ -static DEFINE_SPINLOCK(recall_lock); +static DEFINE_SPINLOCK(state_lock); -static struct kmem_cache *openowner_slab = NULL; -static struct kmem_cache *lockowner_slab = NULL; -static struct kmem_cache *file_slab = NULL; -static struct kmem_cache *stateid_slab = NULL; -static struct kmem_cache *deleg_slab = NULL; +static struct kmem_cache *openowner_slab; +static struct kmem_cache *lockowner_slab; +static struct kmem_cache *file_slab; +static struct kmem_cache *stateid_slab; +static struct kmem_cache *deleg_slab; void nfs4_lock_state(void) @@ -235,9 +236,9 @@ static void nfsd4_free_file(struct nfs4_file *f) static inline void put_nfs4_file(struct nfs4_file *fi) { - if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { + if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del(&fi->fi_hash); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); iput(fi->fi_inode); nfsd4_free_file(fi); } @@ -364,6 +365,79 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); } +/* + * When we recall a delegation, we should be careful not to hand it + * out again straight away. + * To ensure this we keep a pair of bloom filters ('new' and 'old') + * in which the filehandles of recalled delegations are "stored". + * If a filehandle appear in either filter, a delegation is blocked. + * When a delegation is recalled, the filehandle is stored in the "new" + * filter. + * Every 30 seconds we swap the filters and clear the "new" one, + * unless both are empty of course. + * + * Each filter is 256 bits. We hash the filehandle to 32bit and use the + * low 3 bytes as hash-table indices. + * + * 'state_lock', which is always held when block_delegations() is called, + * is used to manage concurrent access. Testing does not need the lock + * except when swapping the two filters. + */ +static struct bloom_pair { + int entries, old_entries; + time_t swap_time; + int new; /* index into 'set' */ + DECLARE_BITMAP(set[2], 256); +} blocked_delegations; + +static int delegation_blocked(struct knfsd_fh *fh) +{ + u32 hash; + struct bloom_pair *bd = &blocked_delegations; + + if (bd->entries == 0) + return 0; + if (seconds_since_boot() - bd->swap_time > 30) { + spin_lock(&state_lock); + if (seconds_since_boot() - bd->swap_time > 30) { + bd->entries -= bd->old_entries; + bd->old_entries = bd->entries; + memset(bd->set[bd->new], 0, + sizeof(bd->set[0])); + bd->new = 1-bd->new; + bd->swap_time = seconds_since_boot(); + } + spin_unlock(&state_lock); + } + hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); + if (test_bit(hash&255, bd->set[0]) && + test_bit((hash>>8)&255, bd->set[0]) && + test_bit((hash>>16)&255, bd->set[0])) + return 1; + + if (test_bit(hash&255, bd->set[1]) && + test_bit((hash>>8)&255, bd->set[1]) && + test_bit((hash>>16)&255, bd->set[1])) + return 1; + + return 0; +} + +static void block_delegations(struct knfsd_fh *fh) +{ + u32 hash; + struct bloom_pair *bd = &blocked_delegations; + + hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); + + __set_bit(hash&255, bd->set[bd->new]); + __set_bit((hash>>8)&255, bd->set[bd->new]); + __set_bit((hash>>16)&255, bd->set[bd->new]); + if (bd->entries == 0) + bd->swap_time = seconds_since_boot(); + bd->entries += 1; +} + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh) { @@ -372,10 +446,11 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv dprintk("NFSD alloc_init_deleg\n"); if (num_delegations > max_delegations) return NULL; + if (delegation_blocked(¤t_fh->fh_handle)) + return NULL; dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); if (dp == NULL) return dp; - dp->dl_stid.sc_type = NFS4_DELEG_STID; /* * delegation seqid's are never incremented. The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid @@ -402,17 +477,24 @@ static void remove_stid(struct nfs4_stid *s) idr_remove(stateids, s->sc_stateid.si_opaque.so_id); } +static void nfs4_free_stid(struct kmem_cache *slab, struct nfs4_stid *s) +{ + kmem_cache_free(slab, s); +} + void nfs4_put_delegation(struct nfs4_delegation *dp) { if (atomic_dec_and_test(&dp->dl_count)) { - kmem_cache_free(deleg_slab, dp); + nfs4_free_stid(deleg_slab, &dp->dl_stid); num_delegations--; } } static void nfs4_put_deleg_lease(struct nfs4_file *fp) { + if (!fp->fi_lease) + return; if (atomic_dec_and_test(&fp->fi_delegees)) { vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease); fp->fi_lease = NULL; @@ -426,18 +508,30 @@ static void unhash_stid(struct nfs4_stid *s) s->sc_type = 0; } +static void +hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) +{ + lockdep_assert_held(&state_lock); + + dp->dl_stid.sc_type = NFS4_DELEG_STID; + list_add(&dp->dl_perfile, &fp->fi_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); +} + /* Called under the state lock. */ static void unhash_delegation(struct nfs4_delegation *dp) { + spin_lock(&state_lock); list_del_init(&dp->dl_perclnt); - spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); list_del_init(&dp->dl_recall_lru); - spin_unlock(&recall_lock); - nfs4_put_deleg_lease(dp->dl_file); - put_nfs4_file(dp->dl_file); - dp->dl_file = NULL; + spin_unlock(&state_lock); + if (dp->dl_file) { + nfs4_put_deleg_lease(dp->dl_file); + put_nfs4_file(dp->dl_file); + dp->dl_file = NULL; + } } @@ -610,7 +704,7 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp) static void free_generic_stateid(struct nfs4_ol_stateid *stp) { remove_stid(&stp->st_stid); - kmem_cache_free(stateid_slab, stp); + nfs4_free_stid(stateid_slab, &stp->st_stid); } static void release_lock_stateid(struct nfs4_ol_stateid *stp) @@ -640,6 +734,12 @@ static void unhash_lockowner(struct nfs4_lockowner *lo) } } +static void nfs4_free_lockowner(struct nfs4_lockowner *lo) +{ + kfree(lo->lo_owner.so_owner.data); + kmem_cache_free(lockowner_slab, lo); +} + static void release_lockowner(struct nfs4_lockowner *lo) { unhash_lockowner(lo); @@ -668,7 +768,6 @@ static void unhash_open_stateid(struct nfs4_ol_stateid *stp) static void release_open_stateid(struct nfs4_ol_stateid *stp) { unhash_open_stateid(stp); - unhash_stid(&stp->st_stid); free_generic_stateid(stp); } @@ -690,12 +789,17 @@ static void release_last_closed_stateid(struct nfs4_openowner *oo) struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; if (s) { - unhash_stid(&s->st_stid); free_generic_stateid(s); oo->oo_last_closed_stid = NULL; } } +static void nfs4_free_openowner(struct nfs4_openowner *oo) +{ + kfree(oo->oo_owner.so_owner.data); + kmem_cache_free(openowner_slab, oo); +} + static void release_openowner(struct nfs4_openowner *oo) { unhash_openowner(oo); @@ -829,10 +933,11 @@ static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca) spin_unlock(&nfsd_drc_lock); } -static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs) +static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, + struct nfsd4_channel_attrs *battrs) { - int numslots = attrs->maxreqs; - int slotsize = slot_bytes(attrs); + int numslots = fattrs->maxreqs; + int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; int mem, i; @@ -849,6 +954,10 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs) if (!new->se_slots[i]) goto out_free; } + + memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); + memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs)); + return new; out_free: while (i--) @@ -994,8 +1103,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); spin_unlock(&nn->client_lock); - memcpy(&new->se_fchannel, &cses->fore_channel, - sizeof(struct nfsd4_channel_attrs)); + if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); /* @@ -1071,10 +1179,22 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) return NULL; } clp->cl_name.len = name.len; + INIT_LIST_HEAD(&clp->cl_sessions); + idr_init(&clp->cl_stateids); + atomic_set(&clp->cl_refcount, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); + INIT_LIST_HEAD(&clp->cl_revoked); + spin_lock_init(&clp->cl_lock); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; } -static inline void +static void free_client(struct nfs4_client *clp) { struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id); @@ -1088,6 +1208,7 @@ free_client(struct nfs4_client *clp) WARN_ON_ONCE(atomic_read(&ses->se_ref)); free_session(ses); } + rpc_destroy_wait_queue(&clp->cl_cb_waitq); free_svc_cred(&clp->cl_cred); kfree(clp->cl_name.data); idr_destroy(&clp->cl_stateids); @@ -1116,17 +1237,22 @@ destroy_client(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); INIT_LIST_HEAD(&reaplist); - spin_lock(&recall_lock); + spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); list_del_init(&dp->dl_perclnt); list_move(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); destroy_delegation(dp); } + list_splice_init(&clp->cl_revoked, &reaplist); + while (!list_empty(&reaplist)) { + dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); + destroy_revoked_delegation(dp); + } while (!list_empty(&clp->cl_openowners)) { oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); release_openowner(oo); @@ -1335,7 +1461,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, if (clp == NULL) return NULL; - INIT_LIST_HEAD(&clp->cl_sessions); ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); if (ret) { spin_lock(&nn->client_lock); @@ -1343,20 +1468,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, spin_unlock(&nn->client_lock); return NULL; } - idr_init(&clp->cl_stateids); - atomic_set(&clp->cl_refcount, 0); - clp->cl_cb_state = NFSD4_CB_UNKNOWN; - INIT_LIST_HEAD(&clp->cl_idhash); - INIT_LIST_HEAD(&clp->cl_openowners); - INIT_LIST_HEAD(&clp->cl_delegations); - INIT_LIST_HEAD(&clp->cl_lru); - INIT_LIST_HEAD(&clp->cl_callbacks); - INIT_LIST_HEAD(&clp->cl_revoked); - spin_lock_init(&clp->cl_lock); nfsd4_init_callback(&clp->cl_cb_null); clp->cl_time = get_seconds(); clear_bit(0, &clp->cl_cb_slot_busy); - rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); copy_verf(clp, verf); rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); gen_confirm(clp); @@ -1526,11 +1640,12 @@ out_err: } /* - * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size. + * Cache a reply. nfsd4_check_resp_size() has bounded the cache size. */ void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { + struct xdr_buf *buf = resp->xdr.buf; struct nfsd4_slot *slot = resp->cstate.slot; unsigned int base; @@ -1544,11 +1659,9 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) slot->sl_datalen = 0; return; } - slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap; - base = (char *)resp->cstate.datap - - (char *)resp->xbuf->head[0].iov_base; - if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data, - slot->sl_datalen)) + base = resp->cstate.data_offset; + slot->sl_datalen = buf->len - base; + if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) WARN("%s: sessions DRC could not cache compound\n", __func__); return; } @@ -1584,28 +1697,31 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, * The sequence operation is not cached because we can use the slot and * session values. */ -__be32 +static __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { struct nfsd4_slot *slot = resp->cstate.slot; + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; __be32 status; dprintk("--> %s slot %p\n", __func__, slot); - /* Either returns 0 or nfserr_retry_uncached */ status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); - if (status == nfserr_retry_uncached_rep) + if (status) return status; - /* The sequence operation has been encoded, cstate->datap set. */ - memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen); + p = xdr_reserve_space(xdr, slot->sl_datalen); + if (!p) { + WARN_ON_ONCE(1); + return nfserr_serverfault; + } + xdr_encode_opaque_fixed(p, slot->sl_data, slot->sl_datalen); + xdr_commit_encode(xdr); resp->opcnt = slot->sl_opcnt; - resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen); - status = slot->sl_status; - - return status; + return slot->sl_status; } /* @@ -1843,6 +1959,11 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs return nfs_ok; } +#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \ + RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32)) +#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \ + RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32)) + static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) { ca->headerpadsz = 0; @@ -1853,9 +1974,9 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) * less than 1k. Tighten up this estimate in the unlikely event * it turns out to be a problem for some client: */ - if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH) + if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ) return nfserr_toosmall; - if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH) + if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ) return nfserr_toosmall; ca->maxresp_cached = 0; if (ca->maxops < 2) @@ -1905,9 +2026,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, return status; status = check_backchannel_attrs(&cr_ses->back_channel); if (status) - return status; + goto out_release_drc_mem; status = nfserr_jukebox; - new = alloc_session(&cr_ses->fore_channel); + new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel); if (!new) goto out_release_drc_mem; conn = alloc_conn_from_crses(rqstp, cr_ses); @@ -2172,11 +2293,13 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_sequence *seq) { struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &resp->xdr; struct nfsd4_session *session; struct nfs4_client *clp; struct nfsd4_slot *slot; struct nfsd4_conn *conn; __be32 status; + int buflen; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (resp->opcnt != 1) @@ -2245,6 +2368,16 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (status) goto out_put_session; + buflen = (seq->cachethis) ? + session->se_fchannel.maxresp_cached : + session->se_fchannel.maxresp_sz; + status = (seq->cachethis) ? nfserr_rep_too_big_to_cache : + nfserr_rep_too_big; + if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) + goto out_put_session; + svc_reserve(rqstp, buflen); + + status = nfs_ok; /* Success! bump slot seqid */ slot->sl_seqid = seq->seqid; slot->sl_flags |= NFSD4_SLOT_INUSE; @@ -2270,7 +2403,8 @@ out: if (!list_empty(&clp->cl_revoked)) seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; out_no_session: - kfree(conn); + if (conn) + free_conn(conn); spin_unlock(&nn->client_lock); return status; out_put_session: @@ -2481,28 +2615,19 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) fp->fi_lease = NULL; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); - spin_lock(&recall_lock); + spin_lock(&state_lock); hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]); - spin_unlock(&recall_lock); -} - -static void -nfsd4_free_slab(struct kmem_cache **slab) -{ - if (*slab == NULL) - return; - kmem_cache_destroy(*slab); - *slab = NULL; + spin_unlock(&state_lock); } void nfsd4_free_slabs(void) { - nfsd4_free_slab(&openowner_slab); - nfsd4_free_slab(&lockowner_slab); - nfsd4_free_slab(&file_slab); - nfsd4_free_slab(&stateid_slab); - nfsd4_free_slab(&deleg_slab); + kmem_cache_destroy(openowner_slab); + kmem_cache_destroy(lockowner_slab); + kmem_cache_destroy(file_slab); + kmem_cache_destroy(stateid_slab); + kmem_cache_destroy(deleg_slab); } int @@ -2511,42 +2636,38 @@ nfsd4_init_slabs(void) openowner_slab = kmem_cache_create("nfsd4_openowners", sizeof(struct nfs4_openowner), 0, 0, NULL); if (openowner_slab == NULL) - goto out_nomem; + goto out; lockowner_slab = kmem_cache_create("nfsd4_lockowners", sizeof(struct nfs4_lockowner), 0, 0, NULL); if (lockowner_slab == NULL) - goto out_nomem; + goto out_free_openowner_slab; file_slab = kmem_cache_create("nfsd4_files", sizeof(struct nfs4_file), 0, 0, NULL); if (file_slab == NULL) - goto out_nomem; + goto out_free_lockowner_slab; stateid_slab = kmem_cache_create("nfsd4_stateids", sizeof(struct nfs4_ol_stateid), 0, 0, NULL); if (stateid_slab == NULL) - goto out_nomem; + goto out_free_file_slab; deleg_slab = kmem_cache_create("nfsd4_delegations", sizeof(struct nfs4_delegation), 0, 0, NULL); if (deleg_slab == NULL) - goto out_nomem; + goto out_free_stateid_slab; return 0; -out_nomem: - nfsd4_free_slabs(); + +out_free_stateid_slab: + kmem_cache_destroy(stateid_slab); +out_free_file_slab: + kmem_cache_destroy(file_slab); +out_free_lockowner_slab: + kmem_cache_destroy(lockowner_slab); +out_free_openowner_slab: + kmem_cache_destroy(openowner_slab); +out: dprintk("nfsd4: out of memory while initializing nfsv4\n"); return -ENOMEM; } -void nfs4_free_openowner(struct nfs4_openowner *oo) -{ - kfree(oo->oo_owner.so_owner.data); - kmem_cache_free(openowner_slab, oo); -} - -void nfs4_free_lockowner(struct nfs4_lockowner *lo) -{ - kfree(lo->lo_owner.so_owner.data); - kmem_cache_free(lockowner_slab, lo); -} - static void init_nfs4_replay(struct nfs4_replay *rp) { rp->rp_status = nfserr_serverfault; @@ -2667,15 +2788,15 @@ find_file(struct inode *ino) unsigned int hashval = file_hashval(ino); struct nfs4_file *fp; - spin_lock(&recall_lock); + spin_lock(&state_lock); hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { if (fp->fi_inode == ino) { get_nfs4_file(fp); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); return fp; } } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); return NULL; } @@ -2712,6 +2833,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) struct nfs4_client *clp = dp->dl_stid.sc_client; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + lockdep_assert_held(&state_lock); /* We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the kernel @@ -2724,6 +2846,8 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) /* Only place dl_time is set; protected by i_lock: */ dp->dl_time = get_seconds(); + block_delegations(&dp->dl_fh); + nfsd4_cb_recall(dp); } @@ -2748,11 +2872,11 @@ static void nfsd_break_deleg_cb(struct file_lock *fl) */ fl->fl_break_time = 0; - spin_lock(&recall_lock); + spin_lock(&state_lock); fp->fi_had_conflict = true; list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) nfsd_break_one_deleg(dp); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); } static @@ -3008,7 +3132,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f return NULL; locks_init_lock(fl); fl->fl_lmops = &nfsd_lease_mng_ops; - fl->fl_flags = FL_LEASE; + fl->fl_flags = FL_DELEG; fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; fl->fl_end = OFFSET_MAX; fl->fl_owner = (fl_owner_t)(dp->dl_file); @@ -3026,49 +3150,38 @@ static int nfs4_setlease(struct nfs4_delegation *dp) if (!fl) return -ENOMEM; fl->fl_file = find_readable_file(fp); - list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); - if (status) { - list_del_init(&dp->dl_perclnt); - locks_free_lock(fl); - return status; - } + if (status) + goto out_free; fp->fi_lease = fl; fp->fi_deleg_file = get_file(fl->fl_file); atomic_set(&fp->fi_delegees, 1); - list_add(&dp->dl_perfile, &fp->fi_delegations); + spin_lock(&state_lock); + hash_delegation_locked(dp, fp); + spin_unlock(&state_lock); return 0; +out_free: + locks_free_lock(fl); + return status; } static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) { - int status; - if (fp->fi_had_conflict) return -EAGAIN; get_nfs4_file(fp); dp->dl_file = fp; - if (!fp->fi_lease) { - status = nfs4_setlease(dp); - if (status) - goto out_free; - return 0; - } - spin_lock(&recall_lock); + if (!fp->fi_lease) + return nfs4_setlease(dp); + spin_lock(&state_lock); + atomic_inc(&fp->fi_delegees); if (fp->fi_had_conflict) { - spin_unlock(&recall_lock); - status = -EAGAIN; - goto out_free; + spin_unlock(&state_lock); + return -EAGAIN; } - atomic_inc(&fp->fi_delegees); - list_add(&dp->dl_perfile, &fp->fi_delegations); - spin_unlock(&recall_lock); - list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); + hash_delegation_locked(dp, fp); + spin_unlock(&state_lock); return 0; -out_free: - put_nfs4_file(fp); - dp->dl_file = fp; - return status; } static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) @@ -3117,6 +3230,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, goto out_no_deleg; break; case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: /* * Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... @@ -3154,8 +3268,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; return; out_free: - unhash_stid(&dp->dl_stid); - nfs4_put_delegation(dp); + destroy_delegation(dp); out_no_deleg: open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && @@ -3372,8 +3485,7 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_delegation *dp; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nn->nfsd4_lease; - time_t t, clientid_val = nn->nfsd4_lease; - time_t u, test_val = nn->nfsd4_lease; + time_t t, new_timeo = nn->nfsd4_lease; nfs4_lock_state(); @@ -3385,8 +3497,7 @@ nfs4_laundromat(struct nfsd_net *nn) clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { t = clp->cl_time - cutoff; - if (clientid_val > t) - clientid_val = t; + new_timeo = min(new_timeo, t); break; } if (mark_client_expired_locked(clp)) { @@ -3403,39 +3514,35 @@ nfs4_laundromat(struct nfsd_net *nn) clp->cl_clientid.cl_id); expire_client(clp); } - spin_lock(&recall_lock); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn) continue; if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { - u = dp->dl_time - cutoff; - if (test_val > u) - test_val = u; + t = dp->dl_time - cutoff; + new_timeo = min(new_timeo, t); break; } list_move(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); revoke_delegation(dp); } - test_val = nn->nfsd4_lease; list_for_each_safe(pos, next, &nn->close_lru) { oo = container_of(pos, struct nfs4_openowner, oo_close_lru); if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { - u = oo->oo_time - cutoff; - if (test_val > u) - test_val = u; + t = oo->oo_time - cutoff; + new_timeo = min(new_timeo, t); break; } release_openowner(oo); } - if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) - clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; + new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); nfs4_unlock_state(); - return clientid_val; + return new_timeo; } static struct workqueue_struct *laundry_wq; @@ -3609,8 +3716,11 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, return nfserr_bad_stateid; status = lookup_clientid(&stateid->si_opaque.so_clid, sessions, nn, &cl); - if (status == nfserr_stale_clientid) + if (status == nfserr_stale_clientid) { + if (sessions) + return nfserr_bad_stateid; return nfserr_stale_stateid; + } if (status) return status; *s = find_stateid_by_type(cl, stateid, typemask); @@ -3632,6 +3742,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct file *file = NULL; __be32 status; if (filpp) @@ -3643,10 +3754,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(net, current_fh, stateid, flags); + nfs4_lock_state(); + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion, nn); if (status) - return status; + goto out; status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; @@ -3657,8 +3770,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (status) goto out; if (filpp) { - *filpp = dp->dl_file->fi_deleg_file; - if (!*filpp) { + file = dp->dl_file->fi_deleg_file; + if (!file) { WARN_ON_ONCE(1); status = nfserr_serverfault; goto out; @@ -3679,25 +3792,36 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, goto out; if (filpp) { if (flags & RD_STATE) - *filpp = find_readable_file(stp->st_file); + file = find_readable_file(stp->st_file); else - *filpp = find_writeable_file(stp->st_file); + file = find_writeable_file(stp->st_file); } break; default: - return nfserr_bad_stateid; + status = nfserr_bad_stateid; + goto out; } status = nfs_ok; + if (file) + *filpp = get_file(file); out: + nfs4_unlock_state(); return status; } static __be32 nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) { - if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) + struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); + + if (check_for_locks(stp->st_file, lo)) return nfserr_locks_held; - release_lock_stateid(stp); + /* + * Currently there's a 1-1 lock stateid<->lockowner + * correspondance, and we have to delete the lockowner when we + * delete the lock stateid: + */ + release_lockowner(lo); return nfs_ok; } @@ -3995,10 +4119,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_close_open_stateid(stp); - if (cstate->minorversion) { - unhash_stid(&stp->st_stid); + if (cstate->minorversion) free_generic_stateid(stp); - } else + else oo->oo_last_closed_stid = stp; if (list_empty(&oo->oo_owner.so_stateids)) { @@ -4138,6 +4261,10 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c if (!same_owner_str(&lo->lo_owner, owner, clid)) return false; + if (list_empty(&lo->lo_owner.so_stateids)) { + WARN_ON_ONCE(1); + return false; + } lst = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); return lst->st_file->fi_inode == inode; @@ -4864,6 +4991,7 @@ static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, struct nfs4_delegation *dp, *next; u64 count = 0; + lockdep_assert_held(&state_lock); list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { if (victims) list_move(&dp->dl_recall_lru, victims); @@ -4879,9 +5007,9 @@ u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max) LIST_HEAD(victims); u64 count; - spin_lock(&recall_lock); + spin_lock(&state_lock); count = nfsd_find_all_delegations(clp, max, &victims); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) revoke_delegation(dp); @@ -4895,11 +5023,11 @@ u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max) LIST_HEAD(victims); u64 count; - spin_lock(&recall_lock); + spin_lock(&state_lock); count = nfsd_find_all_delegations(clp, max, &victims); list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) nfsd_break_one_deleg(dp); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); return count; } @@ -4908,9 +5036,9 @@ u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max) { u64 count = 0; - spin_lock(&recall_lock); + spin_lock(&state_lock); count = nfsd_find_all_delegations(clp, max, NULL); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); nfsd_print_count(clp, count, "delegations"); return count; @@ -4951,13 +5079,6 @@ struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_ #endif /* CONFIG_NFSD_FAULT_INJECTION */ -/* initialization to perform at module load time: */ - -void -nfs4_state_init(void) -{ -} - /* * Since the lifetime of a delegation isn't limited to that of an open, a * client may quite reasonably hang on to a delegation as long as it has @@ -5045,7 +5166,6 @@ nfs4_state_destroy_net(struct net *net) int i; struct nfs4_client *clp = NULL; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct rb_node *node, *tmp; for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->conf_id_hashtbl[i])) { @@ -5054,13 +5174,11 @@ nfs4_state_destroy_net(struct net *net) } } - node = rb_first(&nn->unconf_name_tree); - while (node != NULL) { - tmp = node; - node = rb_next(tmp); - clp = rb_entry(tmp, struct nfs4_client, cl_namenode); - rb_erase(tmp, &nn->unconf_name_tree); - destroy_client(clp); + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&nn->unconf_id_hashtbl[i])) { + clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); + destroy_client(clp); + } } kfree(nn->sessionid_hashtbl); @@ -5119,7 +5237,6 @@ out_recovery: return ret; } -/* should be called with the state lock held */ void nfs4_state_shutdown_net(struct net *net) { @@ -5130,13 +5247,14 @@ nfs4_state_shutdown_net(struct net *net) cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); + nfs4_lock_state(); INIT_LIST_HEAD(&reaplist); - spin_lock(&recall_lock); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_move(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); destroy_delegation(dp); @@ -5144,6 +5262,7 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); + nfs4_unlock_state(); } void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d9454fe5653..944275c8f56 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -98,16 +98,6 @@ xdr_error: \ status = nfserr_bad_xdr; \ goto out -#define READ32(x) (x) = ntohl(*p++) -#define READ64(x) do { \ - (x) = (u64)ntohl(*p++) << 32; \ - (x) |= ntohl(*p++); \ -} while (0) -#define READTIME(x) do { \ - p++; \ - (x) = ntohl(*p++); \ - p++; \ -} while (0) #define READMEM(x,nbytes) do { \ x = (char *)p; \ p += XDR_QUADLEN(nbytes); \ @@ -141,8 +131,8 @@ xdr_error: \ static void next_decode_page(struct nfsd4_compoundargs *argp) { - argp->pagelist++; argp->p = page_address(argp->pagelist[0]); + argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { argp->end = argp->p + (argp->pagelen>>2); argp->pagelen = 0; @@ -190,6 +180,15 @@ static int zero_clientid(clientid_t *clid) return (clid->cl_boot == 0) && (clid->cl_id == 0); } +/** + * defer_free - mark an allocation as deferred freed + * @argp: NFSv4 compound argument structure to be freed with + * @release: release callback to free @p, typically kfree() + * @p: pointer to be freed + * + * Marks @p to be freed when processing the compound operation + * described in @argp finishes. + */ static int defer_free(struct nfsd4_compoundargs *argp, void (*release)(const void *), void *p) @@ -206,6 +205,16 @@ defer_free(struct nfsd4_compoundargs *argp, return 0; } +/** + * savemem - duplicate a chunk of memory for later processing + * @argp: NFSv4 compound argument structure to be freed with + * @p: pointer to be duplicated + * @nbytes: length to be duplicated + * + * Returns a pointer to a copy of @nbytes bytes of memory at @p + * that are preserved until processing of the NFSv4 compound + * operation described by @argp finishes. + */ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) { if (p == argp->tmp) { @@ -234,17 +243,17 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) bmval[2] = 0; READ_BUF(4); - READ32(bmlen); + bmlen = be32_to_cpup(p++); if (bmlen > 1000) goto xdr_error; READ_BUF(bmlen << 2); if (bmlen > 0) - READ32(bmval[0]); + bmval[0] = be32_to_cpup(p++); if (bmlen > 1) - READ32(bmval[1]); + bmval[1] = be32_to_cpup(p++); if (bmlen > 2) - READ32(bmval[2]); + bmval[2] = be32_to_cpup(p++); DECODE_TAIL; } @@ -256,8 +265,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, { int expected_len, len = 0; u32 dummy32; + u64 sec; char *buf; - int host_err; DECODE_HEAD; iattr->ia_valid = 0; @@ -265,12 +274,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return status; READ_BUF(4); - READ32(expected_len); + expected_len = be32_to_cpup(p++); if (bmval[0] & FATTR4_WORD0_SIZE) { READ_BUF(8); len += 8; - READ64(iattr->ia_size); + p = xdr_decode_hyper(p, &iattr->ia_size); iattr->ia_valid |= ATTR_SIZE; } if (bmval[0] & FATTR4_WORD0_ACL) { @@ -278,25 +287,24 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct nfs4_ace *ace; READ_BUF(4); len += 4; - READ32(nace); + nace = be32_to_cpup(p++); if (nace > NFS4_ACL_MAX) - return nfserr_resource; + return nfserr_fbig; *acl = nfs4_acl_new(nace); - if (*acl == NULL) { - host_err = -ENOMEM; - goto out_nfserr; - } + if (*acl == NULL) + return nfserr_jukebox; + defer_free(argp, kfree, *acl); (*acl)->naces = nace; for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { READ_BUF(16); len += 16; - READ32(ace->type); - READ32(ace->flag); - READ32(ace->access_mask); - READ32(dummy32); + ace->type = be32_to_cpup(p++); + ace->flag = be32_to_cpup(p++); + ace->access_mask = be32_to_cpup(p++); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); len += XDR_QUADLEN(dummy32) << 2; READMEM(buf, dummy32); @@ -318,14 +326,14 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_MODE) { READ_BUF(4); len += 4; - READ32(iattr->ia_mode); + iattr->ia_mode = be32_to_cpup(p++); iattr->ia_mode &= (S_IFMT | S_IALLUGO); iattr->ia_valid |= ATTR_MODE; } if (bmval[1] & FATTR4_WORD1_OWNER) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); @@ -336,7 +344,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); @@ -347,15 +355,16 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: /* We require the high 32 bits of 'seconds' to be 0, and we ignore all 32 bits of 'nseconds'. */ READ_BUF(12); len += 12; - READ64(iattr->ia_atime.tv_sec); - READ32(iattr->ia_atime.tv_nsec); + p = xdr_decode_hyper(p, &sec); + iattr->ia_atime.tv_sec = (time_t)sec; + iattr->ia_atime.tv_nsec = be32_to_cpup(p++); if (iattr->ia_atime.tv_nsec >= (u32)1000000000) return nfserr_inval; iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); @@ -370,15 +379,16 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: /* We require the high 32 bits of 'seconds' to be 0, and we ignore all 32 bits of 'nseconds'. */ READ_BUF(12); len += 12; - READ64(iattr->ia_mtime.tv_sec); - READ32(iattr->ia_mtime.tv_nsec); + p = xdr_decode_hyper(p, &sec); + iattr->ia_mtime.tv_sec = sec; + iattr->ia_mtime.tv_nsec = be32_to_cpup(p++); if (iattr->ia_mtime.tv_nsec >= (u32)1000000000) return nfserr_inval; iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); @@ -396,13 +406,13 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { READ_BUF(4); len += 4; - READ32(dummy32); /* lfs: we don't use it */ + dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */ READ_BUF(4); len += 4; - READ32(dummy32); /* pi: we don't use it either */ + dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */ READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN) return nfserr_badlabel; @@ -411,6 +421,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, label->data = kzalloc(dummy32 + 1, GFP_KERNEL); if (!label->data) return nfserr_jukebox; + label->len = dummy32; defer_free(argp, kfree, label->data); memcpy(label->data, buf, dummy32); } @@ -424,10 +435,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, goto xdr_error; DECODE_TAIL; - -out_nfserr: - status = nfserrno(host_err); - goto out; } static __be32 @@ -436,7 +443,7 @@ nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) DECODE_HEAD; READ_BUF(sizeof(stateid_t)); - READ32(sid->si_generation); + sid->si_generation = be32_to_cpup(p++); COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); DECODE_TAIL; @@ -448,7 +455,7 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access DECODE_HEAD; READ_BUF(4); - READ32(access->ac_req_access); + access->ac_req_access = be32_to_cpup(p++); DECODE_TAIL; } @@ -463,7 +470,7 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ /* callback_sec_params4 */ READ_BUF(4); - READ32(nr_secflavs); + nr_secflavs = be32_to_cpup(p++); if (nr_secflavs) cbs->flavor = (u32)(-1); else @@ -471,7 +478,7 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ cbs->flavor = 0; for (i = 0; i < nr_secflavs; ++i) { READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); switch (dummy) { case RPC_AUTH_NULL: /* Nothing to read */ @@ -481,21 +488,21 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ case RPC_AUTH_UNIX: READ_BUF(8); /* stamp */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* machine name */ - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); SAVEMEM(machine_name, dummy); /* uid, gid */ READ_BUF(8); - READ32(uid); - READ32(gid); + uid = be32_to_cpup(p++); + gid = be32_to_cpup(p++); /* more gids */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); if (cbs->flavor == (u32)(-1)) { kuid_t kuid = make_kuid(&init_user_ns, uid); @@ -515,14 +522,14 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ "not supported!\n"); READ_BUF(8); /* gcbp_service */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* gcbp_handle_from_server */ - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); /* gcbp_handle_from_client */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); break; default: @@ -538,7 +545,7 @@ static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, stru DECODE_HEAD; READ_BUF(4); - READ32(bc->bc_cb_program); + bc->bc_cb_program = be32_to_cpup(p++); nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); DECODE_TAIL; @@ -550,7 +557,7 @@ static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); - READ32(bcts->dir); + bcts->dir = be32_to_cpup(p++); /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker * could help us figure out we should be using it. */ DECODE_TAIL; @@ -562,7 +569,7 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) DECODE_HEAD; READ_BUF(4); - READ32(close->cl_seqid); + close->cl_seqid = be32_to_cpup(p++); return nfsd4_decode_stateid(argp, &close->cl_stateid); DECODE_TAIL; @@ -575,8 +582,8 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit DECODE_HEAD; READ_BUF(12); - READ64(commit->co_offset); - READ32(commit->co_count); + p = xdr_decode_hyper(p, &commit->co_offset); + commit->co_count = be32_to_cpup(p++); DECODE_TAIL; } @@ -587,19 +594,30 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create DECODE_HEAD; READ_BUF(4); - READ32(create->cr_type); + create->cr_type = be32_to_cpup(p++); switch (create->cr_type) { case NF4LNK: READ_BUF(4); - READ32(create->cr_linklen); + create->cr_linklen = be32_to_cpup(p++); READ_BUF(create->cr_linklen); - SAVEMEM(create->cr_linkname, create->cr_linklen); + /* + * The VFS will want a null-terminated string, and + * null-terminating in place isn't safe since this might + * end on a page boundary: + */ + create->cr_linkname = + kmalloc(create->cr_linklen + 1, GFP_KERNEL); + if (!create->cr_linkname) + return nfserr_jukebox; + memcpy(create->cr_linkname, p, create->cr_linklen); + create->cr_linkname[create->cr_linklen] = '\0'; + defer_free(argp, kfree, create->cr_linkname); break; case NF4BLK: case NF4CHR: READ_BUF(8); - READ32(create->cr_specdata1); - READ32(create->cr_specdata2); + create->cr_specdata1 = be32_to_cpup(p++); + create->cr_specdata2 = be32_to_cpup(p++); break; case NF4SOCK: case NF4FIFO: @@ -609,7 +627,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create } READ_BUF(4); - READ32(create->cr_namelen); + create->cr_namelen = be32_to_cpup(p++); READ_BUF(create->cr_namelen); SAVEMEM(create->cr_name, create->cr_namelen); if ((status = check_filename(create->cr_name, create->cr_namelen))) @@ -641,7 +659,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) DECODE_HEAD; READ_BUF(4); - READ32(link->li_namelen); + link->li_namelen = be32_to_cpup(p++); READ_BUF(link->li_namelen); SAVEMEM(link->li_name, link->li_namelen); if ((status = check_filename(link->li_name, link->li_namelen))) @@ -659,24 +677,24 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) * type, reclaim(boolean), offset, length, new_lock_owner(boolean) */ READ_BUF(28); - READ32(lock->lk_type); + lock->lk_type = be32_to_cpup(p++); if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) goto xdr_error; - READ32(lock->lk_reclaim); - READ64(lock->lk_offset); - READ64(lock->lk_length); - READ32(lock->lk_is_new); + lock->lk_reclaim = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &lock->lk_offset); + p = xdr_decode_hyper(p, &lock->lk_length); + lock->lk_is_new = be32_to_cpup(p++); if (lock->lk_is_new) { READ_BUF(4); - READ32(lock->lk_new_open_seqid); + lock->lk_new_open_seqid = be32_to_cpup(p++); status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); if (status) return status; READ_BUF(8 + sizeof(clientid_t)); - READ32(lock->lk_new_lock_seqid); + lock->lk_new_lock_seqid = be32_to_cpup(p++); COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); - READ32(lock->lk_new_owner.len); + lock->lk_new_owner.len = be32_to_cpup(p++); READ_BUF(lock->lk_new_owner.len); READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); } else { @@ -684,7 +702,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) if (status) return status; READ_BUF(4); - READ32(lock->lk_old_lock_seqid); + lock->lk_old_lock_seqid = be32_to_cpup(p++); } DECODE_TAIL; @@ -696,13 +714,13 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) DECODE_HEAD; READ_BUF(32); - READ32(lockt->lt_type); + lockt->lt_type = be32_to_cpup(p++); if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) goto xdr_error; - READ64(lockt->lt_offset); - READ64(lockt->lt_length); + p = xdr_decode_hyper(p, &lockt->lt_offset); + p = xdr_decode_hyper(p, &lockt->lt_length); COPYMEM(&lockt->lt_clientid, 8); - READ32(lockt->lt_owner.len); + lockt->lt_owner.len = be32_to_cpup(p++); READ_BUF(lockt->lt_owner.len); READMEM(lockt->lt_owner.data, lockt->lt_owner.len); @@ -715,16 +733,16 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) DECODE_HEAD; READ_BUF(8); - READ32(locku->lu_type); + locku->lu_type = be32_to_cpup(p++); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) goto xdr_error; - READ32(locku->lu_seqid); + locku->lu_seqid = be32_to_cpup(p++); status = nfsd4_decode_stateid(argp, &locku->lu_stateid); if (status) return status; READ_BUF(16); - READ64(locku->lu_offset); - READ64(locku->lu_length); + p = xdr_decode_hyper(p, &locku->lu_offset); + p = xdr_decode_hyper(p, &locku->lu_length); DECODE_TAIL; } @@ -735,7 +753,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup DECODE_HEAD; READ_BUF(4); - READ32(lookup->lo_len); + lookup->lo_len = be32_to_cpup(p++); READ_BUF(lookup->lo_len); SAVEMEM(lookup->lo_name, lookup->lo_len); if ((status = check_filename(lookup->lo_name, lookup->lo_len))) @@ -750,7 +768,7 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh u32 w; READ_BUF(4); - READ32(w); + w = be32_to_cpup(p++); *share_access = w & NFS4_SHARE_ACCESS_MASK; *deleg_want = w & NFS4_SHARE_WANT_MASK; if (deleg_when) @@ -802,7 +820,7 @@ static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) __be32 *p; READ_BUF(4); - READ32(*x); + *x = be32_to_cpup(p++); /* Note: unlinke access bits, deny bits may be zero. */ if (*x & ~NFS4_SHARE_DENY_BOTH) return nfserr_bad_xdr; @@ -816,7 +834,7 @@ static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_ne __be32 *p; READ_BUF(4); - READ32(o->len); + o->len = be32_to_cpup(p++); if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) return nfserr_bad_xdr; @@ -841,7 +859,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) open->op_xdr_error = 0; /* seqid, share_access, share_deny, clientid, ownerlen */ READ_BUF(4); - READ32(open->op_seqid); + open->op_seqid = be32_to_cpup(p++); /* decode, yet ignore deleg_when until supported */ status = nfsd4_decode_share_access(argp, &open->op_share_access, &open->op_deleg_want, &dummy); @@ -856,13 +874,13 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) if (status) goto xdr_error; READ_BUF(4); - READ32(open->op_create); + open->op_create = be32_to_cpup(p++); switch (open->op_create) { case NFS4_OPEN_NOCREATE: break; case NFS4_OPEN_CREATE: READ_BUF(4); - READ32(open->op_createmode); + open->op_createmode = be32_to_cpup(p++); switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: case NFS4_CREATE_GUARDED: @@ -895,12 +913,12 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) /* open_claim */ READ_BUF(4); - READ32(open->op_claim_type); + open->op_claim_type = be32_to_cpup(p++); switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_DELEGATE_PREV: READ_BUF(4); - READ32(open->op_fname.len); + open->op_fname.len = be32_to_cpup(p++); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); if ((status = check_filename(open->op_fname.data, open->op_fname.len))) @@ -908,14 +926,14 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) break; case NFS4_OPEN_CLAIM_PREVIOUS: READ_BUF(4); - READ32(open->op_delegate_type); + open->op_delegate_type = be32_to_cpup(p++); break; case NFS4_OPEN_CLAIM_DELEGATE_CUR: status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); if (status) return status; READ_BUF(4); - READ32(open->op_fname.len); + open->op_fname.len = be32_to_cpup(p++); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); if ((status = check_filename(open->op_fname.data, open->op_fname.len))) @@ -945,13 +963,16 @@ static __be32 nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) { DECODE_HEAD; - + + if (argp->minorversion >= 1) + return nfserr_notsupp; + status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); if (status) return status; READ_BUF(4); - READ32(open_conf->oc_seqid); - + open_conf->oc_seqid = be32_to_cpup(p++); + DECODE_TAIL; } @@ -964,7 +985,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d if (status) return status; READ_BUF(4); - READ32(open_down->od_seqid); + open_down->od_seqid = be32_to_cpup(p++); status = nfsd4_decode_share_access(argp, &open_down->od_share_access, &open_down->od_deleg_want, NULL); if (status) @@ -981,7 +1002,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) DECODE_HEAD; READ_BUF(4); - READ32(putfh->pf_fhlen); + putfh->pf_fhlen = be32_to_cpup(p++); if (putfh->pf_fhlen > NFS4_FHSIZE) goto xdr_error; READ_BUF(putfh->pf_fhlen); @@ -991,6 +1012,14 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) } static __be32 +nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) +{ + if (argp->minorversion == 0) + return nfs_ok; + return nfserr_notsupp; +} + +static __be32 nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) { DECODE_HEAD; @@ -999,8 +1028,8 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) if (status) return status; READ_BUF(12); - READ64(read->rd_offset); - READ32(read->rd_length); + p = xdr_decode_hyper(p, &read->rd_offset); + read->rd_length = be32_to_cpup(p++); DECODE_TAIL; } @@ -1011,10 +1040,10 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read DECODE_HEAD; READ_BUF(24); - READ64(readdir->rd_cookie); + p = xdr_decode_hyper(p, &readdir->rd_cookie); COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data)); - READ32(readdir->rd_dircount); /* just in case you needed a useless field... */ - READ32(readdir->rd_maxcount); + readdir->rd_dircount = be32_to_cpup(p++); + readdir->rd_maxcount = be32_to_cpup(p++); if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval))) goto out; @@ -1027,7 +1056,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove DECODE_HEAD; READ_BUF(4); - READ32(remove->rm_namelen); + remove->rm_namelen = be32_to_cpup(p++); READ_BUF(remove->rm_namelen); SAVEMEM(remove->rm_name, remove->rm_namelen); if ((status = check_filename(remove->rm_name, remove->rm_namelen))) @@ -1042,10 +1071,10 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename DECODE_HEAD; READ_BUF(4); - READ32(rename->rn_snamelen); + rename->rn_snamelen = be32_to_cpup(p++); READ_BUF(rename->rn_snamelen + 4); SAVEMEM(rename->rn_sname, rename->rn_snamelen); - READ32(rename->rn_tnamelen); + rename->rn_tnamelen = be32_to_cpup(p++); READ_BUF(rename->rn_tnamelen); SAVEMEM(rename->rn_tname, rename->rn_tnamelen); if ((status = check_filename(rename->rn_sname, rename->rn_snamelen))) @@ -1061,6 +1090,9 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) { DECODE_HEAD; + if (argp->minorversion >= 1) + return nfserr_notsupp; + READ_BUF(sizeof(clientid_t)); COPYMEM(clientid, sizeof(clientid_t)); @@ -1074,7 +1106,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, DECODE_HEAD; READ_BUF(4); - READ32(secinfo->si_namelen); + secinfo->si_namelen = be32_to_cpup(p++); READ_BUF(secinfo->si_namelen); SAVEMEM(secinfo->si_name, secinfo->si_namelen); status = check_filename(secinfo->si_name, secinfo->si_namelen); @@ -1090,7 +1122,7 @@ nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, DECODE_HEAD; READ_BUF(4); - READ32(sin->sin_style); + sin->sin_style = be32_to_cpup(p++); DECODE_TAIL; } @@ -1111,6 +1143,9 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient { DECODE_HEAD; + if (argp->minorversion >= 1) + return nfserr_notsupp; + READ_BUF(NFS4_VERIFIER_SIZE); COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE); @@ -1118,16 +1153,16 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient if (status) return nfserr_bad_xdr; READ_BUF(8); - READ32(setclientid->se_callback_prog); - READ32(setclientid->se_callback_netid_len); + setclientid->se_callback_prog = be32_to_cpup(p++); + setclientid->se_callback_netid_len = be32_to_cpup(p++); READ_BUF(setclientid->se_callback_netid_len + 4); SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); - READ32(setclientid->se_callback_addr_len); + setclientid->se_callback_addr_len = be32_to_cpup(p++); READ_BUF(setclientid->se_callback_addr_len + 4); SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); - READ32(setclientid->se_callback_ident); + setclientid->se_callback_ident = be32_to_cpup(p++); DECODE_TAIL; } @@ -1137,6 +1172,9 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s { DECODE_HEAD; + if (argp->minorversion >= 1) + return nfserr_notsupp; + READ_BUF(8 + NFS4_VERIFIER_SIZE); COPYMEM(&scd_c->sc_clientid, 8); COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE); @@ -1157,7 +1195,7 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify * nfsd4_proc_verify */ READ_BUF(4); - READ32(verify->ve_attrlen); + verify->ve_attrlen = be32_to_cpup(p++); READ_BUF(verify->ve_attrlen); SAVEMEM(verify->ve_attrval, verify->ve_attrlen); @@ -1175,11 +1213,11 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) if (status) return status; READ_BUF(16); - READ64(write->wr_offset); - READ32(write->wr_stable_how); + p = xdr_decode_hyper(p, &write->wr_offset); + write->wr_stable_how = be32_to_cpup(p++); if (write->wr_stable_how > 2) goto xdr_error; - READ32(write->wr_buflen); + write->wr_buflen = be32_to_cpup(p++); /* Sorry .. no magic macros for this.. * * READ_BUF(write->wr_buflen); @@ -1193,7 +1231,6 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) } write->wr_head.iov_base = p; write->wr_head.iov_len = avail; - WARN_ON(avail != (XDR_QUADLEN(avail) << 2)); write->wr_pagelist = argp->pagelist; len = XDR_QUADLEN(write->wr_buflen) << 2; @@ -1208,6 +1245,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) len -= pages * PAGE_SIZE; argp->p = (__be32 *)page_address(argp->pagelist[0]); + argp->pagelist++; argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); } argp->p += XDR_QUADLEN(len); @@ -1220,9 +1258,12 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel { DECODE_HEAD; + if (argp->minorversion >= 1) + return nfserr_notsupp; + READ_BUF(12); COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t)); - READ32(rlockowner->rl_owner.len); + rlockowner->rl_owner.len = be32_to_cpup(p++); READ_BUF(rlockowner->rl_owner.len); READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); @@ -1246,63 +1287,63 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, return nfserr_bad_xdr; READ_BUF(4); - READ32(exid->flags); + exid->flags = be32_to_cpup(p++); /* Ignore state_protect4_a */ READ_BUF(4); - READ32(exid->spa_how); + exid->spa_how = be32_to_cpup(p++); switch (exid->spa_how) { case SP4_NONE: break; case SP4_MACH_CRED: /* spo_must_enforce */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; /* spo_must_allow */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; break; case SP4_SSV: /* ssp_ops */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; /* ssp_hash_algs<> */ READ_BUF(4); - READ32(tmp); + tmp = be32_to_cpup(p++); while (tmp--) { READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); } /* ssp_encr_algs<> */ READ_BUF(4); - READ32(tmp); + tmp = be32_to_cpup(p++); while (tmp--) { READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); } /* ssp_window and ssp_num_gss_handles */ READ_BUF(8); - READ32(dummy); - READ32(dummy); + dummy = be32_to_cpup(p++); + dummy = be32_to_cpup(p++); break; default: goto xdr_error; @@ -1310,7 +1351,7 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, /* Ignore Implementation ID */ READ_BUF(4); /* nfs_impl_id4 array length */ - READ32(dummy); + dummy = be32_to_cpup(p++); if (dummy > 1) goto xdr_error; @@ -1318,13 +1359,13 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, if (dummy == 1) { /* nii_domain */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); /* nii_name */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); @@ -1344,21 +1385,21 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, READ_BUF(16); COPYMEM(&sess->clientid, 8); - READ32(sess->seqid); - READ32(sess->flags); + sess->seqid = be32_to_cpup(p++); + sess->flags = be32_to_cpup(p++); /* Fore channel attrs */ READ_BUF(28); - READ32(dummy); /* headerpadsz is always 0 */ - READ32(sess->fore_channel.maxreq_sz); - READ32(sess->fore_channel.maxresp_sz); - READ32(sess->fore_channel.maxresp_cached); - READ32(sess->fore_channel.maxops); - READ32(sess->fore_channel.maxreqs); - READ32(sess->fore_channel.nr_rdma_attrs); + dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + sess->fore_channel.maxreq_sz = be32_to_cpup(p++); + sess->fore_channel.maxresp_sz = be32_to_cpup(p++); + sess->fore_channel.maxresp_cached = be32_to_cpup(p++); + sess->fore_channel.maxops = be32_to_cpup(p++); + sess->fore_channel.maxreqs = be32_to_cpup(p++); + sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++); if (sess->fore_channel.nr_rdma_attrs == 1) { READ_BUF(4); - READ32(sess->fore_channel.rdma_attrs); + sess->fore_channel.rdma_attrs = be32_to_cpup(p++); } else if (sess->fore_channel.nr_rdma_attrs > 1) { dprintk("Too many fore channel attr bitmaps!\n"); goto xdr_error; @@ -1366,23 +1407,23 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, /* Back channel attrs */ READ_BUF(28); - READ32(dummy); /* headerpadsz is always 0 */ - READ32(sess->back_channel.maxreq_sz); - READ32(sess->back_channel.maxresp_sz); - READ32(sess->back_channel.maxresp_cached); - READ32(sess->back_channel.maxops); - READ32(sess->back_channel.maxreqs); - READ32(sess->back_channel.nr_rdma_attrs); + dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + sess->back_channel.maxreq_sz = be32_to_cpup(p++); + sess->back_channel.maxresp_sz = be32_to_cpup(p++); + sess->back_channel.maxresp_cached = be32_to_cpup(p++); + sess->back_channel.maxops = be32_to_cpup(p++); + sess->back_channel.maxreqs = be32_to_cpup(p++); + sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++); if (sess->back_channel.nr_rdma_attrs == 1) { READ_BUF(4); - READ32(sess->back_channel.rdma_attrs); + sess->back_channel.rdma_attrs = be32_to_cpup(p++); } else if (sess->back_channel.nr_rdma_attrs > 1) { dprintk("Too many back channel attr bitmaps!\n"); goto xdr_error; } READ_BUF(4); - READ32(sess->callback_prog); + sess->callback_prog = be32_to_cpup(p++); nfsd4_decode_cb_sec(argp, &sess->cb_sec); DECODE_TAIL; } @@ -1405,7 +1446,7 @@ nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, DECODE_HEAD; READ_BUF(sizeof(stateid_t)); - READ32(free_stateid->fr_stateid.si_generation); + free_stateid->fr_stateid.si_generation = be32_to_cpup(p++); COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t)); DECODE_TAIL; @@ -1419,10 +1460,10 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); - READ32(seq->seqid); - READ32(seq->slotid); - READ32(seq->maxslots); - READ32(seq->cachethis); + seq->seqid = be32_to_cpup(p++); + seq->slotid = be32_to_cpup(p++); + seq->maxslots = be32_to_cpup(p++); + seq->cachethis = be32_to_cpup(p++); DECODE_TAIL; } @@ -1479,7 +1520,7 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str DECODE_HEAD; READ_BUF(4); - READ32(rc->rca_one_fs); + rc->rca_one_fs = be32_to_cpup(p++); DECODE_TAIL; } @@ -1519,7 +1560,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh, [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, [OP_READ] = (nfsd4_dec)nfsd4_decode_read, [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, @@ -1536,46 +1577,6 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, -}; - -static nfsd4_dec nfsd41_dec_ops[] = { - [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, - [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, - [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, - [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, - [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, - [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, - [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, - [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, - [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, - [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, - [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, - [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, - [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, - [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, - [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_READ] = (nfsd4_dec)nfsd4_decode_read, - [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, - [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, - [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, - [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, - [OP_RENEW] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, - [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, - [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_SETCLIENTID_CONFIRM]= (nfsd4_dec)nfsd4_decode_notsupp, - [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, - [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp, /* new operations for NFSv4.1 */ [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl, @@ -1599,32 +1600,39 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, }; -struct nfsd4_minorversion_ops { - nfsd4_dec *decoders; - int nops; -}; - -static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { - [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, - [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, - [2] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, -}; +static inline bool +nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op) +{ + if (op->opnum < FIRST_NFS4_OP) + return false; + else if (argp->minorversion == 0 && op->opnum > LAST_NFS40_OP) + return false; + else if (argp->minorversion == 1 && op->opnum > LAST_NFS41_OP) + return false; + else if (argp->minorversion == 2 && op->opnum > LAST_NFS42_OP) + return false; + return true; +} static __be32 nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { DECODE_HEAD; struct nfsd4_op *op; - struct nfsd4_minorversion_ops *ops; bool cachethis = false; + int auth_slack= argp->rqstp->rq_auth_slack; + int max_reply = auth_slack + 8; /* opcnt, status */ + int readcount = 0; + int readbytes = 0; int i; READ_BUF(4); - READ32(argp->taglen); + argp->taglen = be32_to_cpup(p++); READ_BUF(argp->taglen + 8); SAVEMEM(argp->tag, argp->taglen); - READ32(argp->minorversion); - READ32(argp->opcnt); + argp->minorversion = be32_to_cpup(p++); + argp->opcnt = be32_to_cpup(p++); + max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); if (argp->taglen > NFSD4_MAX_TAGLEN) goto xdr_error; @@ -1640,110 +1648,98 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } } - if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion)) + if (argp->minorversion > NFSD_SUPPORTED_MINOR_VERSION) argp->opcnt = 0; - ops = &nfsd4_minorversion[argp->minorversion]; for (i = 0; i < argp->opcnt; i++) { op = &argp->ops[i]; op->replay = NULL; READ_BUF(4); - READ32(op->opnum); + op->opnum = be32_to_cpup(p++); - if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP) - op->status = ops->decoders[op->opnum](argp, &op->u); + if (nfsd4_opnum_in_range(argp, op)) + op->status = nfsd4_dec_ops[op->opnum](argp, &op->u); else { op->opnum = OP_ILLEGAL; op->status = nfserr_op_illegal; } - - if (op->status) { - argp->opcnt = i+1; - break; - } /* * We'll try to cache the result in the DRC if any one * op in the compound wants to be cached: */ cachethis |= nfsd4_cache_this_op(op); + + if (op->opnum == OP_READ) { + readcount++; + readbytes += nfsd4_max_reply(argp->rqstp, op); + } else + max_reply += nfsd4_max_reply(argp->rqstp, op); + + if (op->status) { + argp->opcnt = i+1; + break; + } } /* Sessions make the DRC unnecessary: */ if (argp->minorversion) cachethis = false; + svc_reserve(argp->rqstp, max_reply + readbytes); argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; - DECODE_TAIL; -} - -#define WRITE32(n) *p++ = htonl(n) -#define WRITE64(n) do { \ - *p++ = htonl((u32)((n) >> 32)); \ - *p++ = htonl((u32)(n)); \ -} while (0) -#define WRITEMEM(ptr,nbytes) do { if (nbytes > 0) { \ - *(p + XDR_QUADLEN(nbytes) -1) = 0; \ - memcpy(p, ptr, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -}} while (0) + if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) + argp->rqstp->rq_splice_ok = false; -static void write32(__be32 **p, u32 n) -{ - *(*p)++ = htonl(n); -} - -static void write64(__be32 **p, u64 n) -{ - write32(p, (n >> 32)); - write32(p, (u32)n); + DECODE_TAIL; } -static void write_change(__be32 **p, struct kstat *stat, struct inode *inode) +static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode) { if (IS_I_VERSION(inode)) { - write64(p, inode->i_version); + p = xdr_encode_hyper(p, inode->i_version); } else { - write32(p, stat->ctime.tv_sec); - write32(p, stat->ctime.tv_nsec); + *p++ = cpu_to_be32(stat->ctime.tv_sec); + *p++ = cpu_to_be32(stat->ctime.tv_nsec); } + return p; } -static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) +static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c) { - write32(p, c->atomic); + *p++ = cpu_to_be32(c->atomic); if (c->change_supported) { - write64(p, c->before_change); - write64(p, c->after_change); + p = xdr_encode_hyper(p, c->before_change); + p = xdr_encode_hyper(p, c->after_change); } else { - write32(p, c->before_ctime_sec); - write32(p, c->before_ctime_nsec); - write32(p, c->after_ctime_sec); - write32(p, c->after_ctime_nsec); + *p++ = cpu_to_be32(c->before_ctime_sec); + *p++ = cpu_to_be32(c->before_ctime_nsec); + *p++ = cpu_to_be32(c->after_ctime_sec); + *p++ = cpu_to_be32(c->after_ctime_nsec); } + return p; } -#define RESERVE_SPACE(nbytes) do { \ - p = resp->p; \ - BUG_ON(p + XDR_QUADLEN(nbytes) > resp->end); \ -} while (0) -#define ADJUST_ARGS() resp->p = p - /* Encode as an array of strings the string given with components * separated @sep, escaped with esc_enter and esc_exit. */ -static __be32 nfsd4_encode_components_esc(char sep, char *components, - __be32 **pp, int *buflen, - char esc_enter, char esc_exit) +static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, + char *components, char esc_enter, + char esc_exit) { - __be32 *p = *pp; - __be32 *countp = p; + __be32 *p; + __be32 pathlen; + int pathlen_offset; int strlen, count=0; char *str, *end, *next; dprintk("nfsd4_encode_components(%s)\n", components); - if ((*buflen -= 4) < 0) + + pathlen_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 4); + if (!p) return nfserr_resource; - WRITE32(0); /* We will fill this in with @count later */ + p++; /* We will fill this in with @count later */ + end = str = components; while (*end) { bool found_esc = false; @@ -1765,59 +1761,57 @@ static __be32 nfsd4_encode_components_esc(char sep, char *components, strlen = end - str; if (strlen) { - if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0) + p = xdr_reserve_space(xdr, strlen + 4); + if (!p) return nfserr_resource; - WRITE32(strlen); - WRITEMEM(str, strlen); + p = xdr_encode_opaque(p, str, strlen); count++; } else end++; str = end; } - *pp = p; - p = countp; - WRITE32(count); + pathlen = htonl(xdr->buf->len - pathlen_offset); + write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4); return 0; } /* Encode as an array of strings the string given with components * separated @sep. */ -static __be32 nfsd4_encode_components(char sep, char *components, - __be32 **pp, int *buflen) +static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep, + char *components) { - return nfsd4_encode_components_esc(sep, components, pp, buflen, 0, 0); + return nfsd4_encode_components_esc(xdr, sep, components, 0, 0); } /* * encode a location element of a fs_locations structure */ -static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, - __be32 **pp, int *buflen) +static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr, + struct nfsd4_fs_location *location) { __be32 status; - __be32 *p = *pp; - status = nfsd4_encode_components_esc(':', location->hosts, &p, buflen, + status = nfsd4_encode_components_esc(xdr, ':', location->hosts, '[', ']'); if (status) return status; - status = nfsd4_encode_components('/', location->path, &p, buflen); + status = nfsd4_encode_components(xdr, '/', location->path); if (status) return status; - *pp = p; return 0; } /* * Encode a path in RFC3530 'pathname4' format */ -static __be32 nfsd4_encode_path(const struct path *root, - const struct path *path, __be32 **pp, int *buflen) +static __be32 nfsd4_encode_path(struct xdr_stream *xdr, + const struct path *root, + const struct path *path) { struct path cur = *path; - __be32 *p = *pp; + __be32 *p; struct dentry **components = NULL; unsigned int ncomponents = 0; __be32 err = nfserr_jukebox; @@ -1848,11 +1842,11 @@ static __be32 nfsd4_encode_path(const struct path *root, components[ncomponents++] = cur.dentry; cur.dentry = dget_parent(cur.dentry); } - - *buflen -= 4; - if (*buflen < 0) + err = nfserr_resource; + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_free; - WRITE32(ncomponents); + *p++ = cpu_to_be32(ncomponents); while (ncomponents) { struct dentry *dentry = components[ncomponents - 1]; @@ -1860,20 +1854,18 @@ static __be32 nfsd4_encode_path(const struct path *root, spin_lock(&dentry->d_lock); len = dentry->d_name.len; - *buflen -= 4 + (XDR_QUADLEN(len) << 2); - if (*buflen < 0) { + p = xdr_reserve_space(xdr, len + 4); + if (!p) { spin_unlock(&dentry->d_lock); goto out_free; } - WRITE32(len); - WRITEMEM(dentry->d_name.name, len); + p = xdr_encode_opaque(p, dentry->d_name.name, len); dprintk("/%s", dentry->d_name.name); spin_unlock(&dentry->d_lock); dput(dentry); ncomponents--; } - *pp = p; err = 0; out_free: dprintk(")\n"); @@ -1884,8 +1876,8 @@ out_free: return err; } -static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, - const struct path *path, __be32 **pp, int *buflen) +static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr, + struct svc_rqst *rqstp, const struct path *path) { struct svc_export *exp_ps; __be32 res; @@ -1893,7 +1885,7 @@ static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, exp_ps = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp_ps)) return nfserrno(PTR_ERR(exp_ps)); - res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen); + res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path); exp_put(exp_ps); return res; } @@ -1901,28 +1893,26 @@ static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, /* * encode a fs_locations structure */ -static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, - struct svc_export *exp, - __be32 **pp, int *buflen) +static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr, + struct svc_rqst *rqstp, struct svc_export *exp) { __be32 status; int i; - __be32 *p = *pp; + __be32 *p; struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; - status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen); + status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path); if (status) return status; - if ((*buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) return nfserr_resource; - WRITE32(fslocs->locations_count); + *p++ = cpu_to_be32(fslocs->locations_count); for (i=0; i<fslocs->locations_count; i++) { - status = nfsd4_encode_fs_location4(&fslocs->locations[i], - &p, buflen); + status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]); if (status) return status; } - *pp = p; return 0; } @@ -1940,56 +1930,16 @@ static u32 nfs4_file_type(umode_t mode) }; } -static __be32 -nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid, - __be32 **p, int *buflen) -{ - int status; - - if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4) - return nfserr_resource; - if (whotype != NFS4_ACL_WHO_NAMED) - status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1)); - else if (gid_valid(gid)) - status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1)); - else - status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1)); - if (status < 0) - return nfserrno(status); - *p = xdr_encode_opaque(*p, NULL, status); - *buflen -= (XDR_QUADLEN(status) << 2) + 4; - BUG_ON(*buflen < 0); - return 0; -} - -static inline __be32 -nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen) -{ - return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID, - p, buflen); -} - static inline __be32 -nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen) +nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, + struct nfs4_ace *ace) { - return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group, - p, buflen); -} - -static inline __be32 -nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, - __be32 **p, int *buflen) -{ - kuid_t uid = INVALID_UID; - kgid_t gid = INVALID_GID; - - if (ace->whotype == NFS4_ACL_WHO_NAMED) { - if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - gid = ace->who_gid; - else - uid = ace->who_uid; - } - return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen); + if (ace->whotype != NFS4_ACL_WHO_NAMED) + return nfs4_acl_write_who(xdr, ace->whotype); + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + return nfsd4_encode_group(xdr, rqstp, ace->who_gid); + else + return nfsd4_encode_user(xdr, rqstp, ace->who_uid); } #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ @@ -1998,31 +1948,28 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static inline __be32 -nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen) +nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, + void *context, int len) { - __be32 *p = *pp; + __be32 *p; - if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4)) + p = xdr_reserve_space(xdr, len + 4 + 4 + 4); + if (!p) return nfserr_resource; /* * For now we use a 0 here to indicate the null translation; in * the future we may place a call to translation code here. */ - if ((*buflen -= 8) < 0) - return nfserr_resource; - - WRITE32(0); /* lfs */ - WRITE32(0); /* pi */ + *p++ = cpu_to_be32(0); /* lfs */ + *p++ = cpu_to_be32(0); /* pi */ p = xdr_encode_opaque(p, context, len); - *buflen -= (XDR_QUADLEN(len) << 2) + 4; - - *pp = p; return 0; } #else static inline __be32 -nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen) +nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, + void *context, int len) { return 0; } #endif @@ -2061,26 +2008,26 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) /* * Note: @fhp can be NULL; in this case, we might have to compose the filehandle * ourselves. - * - * countp is the buffer size in _words_ */ -__be32 -nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - struct dentry *dentry, __be32 **buffer, int count, u32 *bmval, +static __be32 +nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, + struct svc_export *exp, + struct dentry *dentry, u32 *bmval, struct svc_rqst *rqstp, int ignore_crossmnt) { u32 bmval0 = bmval[0]; u32 bmval1 = bmval[1]; u32 bmval2 = bmval[2]; struct kstat stat; - struct svc_fh tempfh; + struct svc_fh *tempfh = NULL; struct kstatfs statfs; - int buflen = count << 2; - __be32 *attrlenp; + __be32 *p; + int starting_len = xdr->buf->len; + int attrlen_offset; + __be32 attrlen; u32 dummy; u64 dummy64; u32 rdattr_err = 0; - __be32 *p = *buffer; __be32 status; int err; int aclsupport = 0; @@ -2111,8 +2058,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, err = vfs_getattr(&path, &stat); if (err) goto out_nfserr; - if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | - FATTR4_WORD0_MAXNAME)) || + if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { err = vfs_statfs(&path, &statfs); @@ -2120,11 +2067,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_nfserr; } if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { - fh_init(&tempfh, NFS4_FHSIZE); - status = fh_compose(&tempfh, exp, dentry, NULL); + tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + status = nfserr_jukebox; + if (!tempfh) + goto out; + fh_init(tempfh, NFS4_FHSIZE); + status = fh_compose(tempfh, exp, dentry, NULL); if (status) goto out; - fhp = &tempfh; + fhp = tempfh; } if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_SUPPORTED_ATTRS)) { @@ -2157,25 +2108,33 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ if (bmval2) { - if ((buflen -= 16) < 0) + p = xdr_reserve_space(xdr, 16); + if (!p) goto out_resource; - WRITE32(3); - WRITE32(bmval0); - WRITE32(bmval1); - WRITE32(bmval2); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); + *p++ = cpu_to_be32(bmval2); } else if (bmval1) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE32(2); - WRITE32(bmval0); - WRITE32(bmval1); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); } else { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE32(1); - WRITE32(bmval0); + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(bmval0); } - attrlenp = p++; /* to be backfilled later */ + + attrlen_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { u32 word0 = nfsd_suppattrs0(minorversion); @@ -2187,302 +2146,343 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (!contextsupport) word2 &= ~FATTR4_WORD2_SECURITY_LABEL; if (!word2) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE32(2); - WRITE32(word0); - WRITE32(word1); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(word0); + *p++ = cpu_to_be32(word1); } else { - if ((buflen -= 16) < 0) + p = xdr_reserve_space(xdr, 16); + if (!p) goto out_resource; - WRITE32(3); - WRITE32(word0); - WRITE32(word1); - WRITE32(word2); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(word0); + *p++ = cpu_to_be32(word1); + *p++ = cpu_to_be32(word2); } } if (bmval0 & FATTR4_WORD0_TYPE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; dummy = nfs4_file_type(stat.mode); - if (dummy == NF4BAD) - goto out_serverfault; - WRITE32(dummy); + if (dummy == NF4BAD) { + status = nfserr_serverfault; + goto out; + } + *p++ = cpu_to_be32(dummy); } if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) - WRITE32(NFS4_FH_PERSISTENT); + *p++ = cpu_to_be32(NFS4_FH_PERSISTENT); else - WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME); + *p++ = cpu_to_be32(NFS4_FH_PERSISTENT| + NFS4_FH_VOL_RENAME); } if (bmval0 & FATTR4_WORD0_CHANGE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - write_change(&p, &stat, dentry->d_inode); + p = encode_change(p, &stat, dentry->d_inode); } if (bmval0 & FATTR4_WORD0_SIZE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64(stat.size); + p = xdr_encode_hyper(p, stat.size); } if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_NAMED_ATTR) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); } if (bmval0 & FATTR4_WORD0_FSID) { - if ((buflen -= 16) < 0) + p = xdr_reserve_space(xdr, 16); + if (!p) goto out_resource; if (exp->ex_fslocs.migrated) { - WRITE64(NFS4_REFERRAL_FSID_MAJOR); - WRITE64(NFS4_REFERRAL_FSID_MINOR); + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR); + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR); } else switch(fsid_source(fhp)) { case FSIDSOURCE_FSID: - WRITE64((u64)exp->ex_fsid); - WRITE64((u64)0); + p = xdr_encode_hyper(p, (u64)exp->ex_fsid); + p = xdr_encode_hyper(p, (u64)0); break; case FSIDSOURCE_DEV: - WRITE32(0); - WRITE32(MAJOR(stat.dev)); - WRITE32(0); - WRITE32(MINOR(stat.dev)); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(MAJOR(stat.dev)); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(MINOR(stat.dev)); break; case FSIDSOURCE_UUID: - WRITEMEM(exp->ex_uuid, 16); + p = xdr_encode_opaque_fixed(p, exp->ex_uuid, + EX_UUID_LEN); break; } } if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); } if (bmval0 & FATTR4_WORD0_LEASE_TIME) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(nn->nfsd4_lease); + *p++ = cpu_to_be32(nn->nfsd4_lease); } if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(rdattr_err); + *p++ = cpu_to_be32(rdattr_err); } if (bmval0 & FATTR4_WORD0_ACL) { struct nfs4_ace *ace; if (acl == NULL) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); goto out_acl; } - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(acl->naces); + *p++ = cpu_to_be32(acl->naces); for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { - if ((buflen -= 4*3) < 0) - goto out_resource; - WRITE32(ace->type); - WRITE32(ace->flag); - WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); - status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen); - if (status == nfserr_resource) + p = xdr_reserve_space(xdr, 4*3); + if (!p) goto out_resource; + *p++ = cpu_to_be32(ace->type); + *p++ = cpu_to_be32(ace->flag); + *p++ = cpu_to_be32(ace->access_mask & + NFS4_ACE_MASK_ALL); + status = nfsd4_encode_aclname(xdr, rqstp, ace); if (status) goto out; } } out_acl: if (bmval0 & FATTR4_WORD0_ACLSUPPORT) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(aclsupport ? + *p++ = cpu_to_be32(aclsupport ? ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); } if (bmval0 & FATTR4_WORD0_CANSETTIME) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); } if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_FILEHANDLE) { - buflen -= (XDR_QUADLEN(fhp->fh_handle.fh_size) << 2) + 4; - if (buflen < 0) + p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4); + if (!p) goto out_resource; - WRITE32(fhp->fh_handle.fh_size); - WRITEMEM(&fhp->fh_handle.fh_base, fhp->fh_handle.fh_size); + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, + fhp->fh_handle.fh_size); } if (bmval0 & FATTR4_WORD0_FILEID) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64(stat.ino); + p = xdr_encode_hyper(p, stat.ino); } if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) statfs.f_ffree); + p = xdr_encode_hyper(p, (u64) statfs.f_ffree); } if (bmval0 & FATTR4_WORD0_FILES_FREE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) statfs.f_ffree); + p = xdr_encode_hyper(p, (u64) statfs.f_ffree); } if (bmval0 & FATTR4_WORD0_FILES_TOTAL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) statfs.f_files); + p = xdr_encode_hyper(p, (u64) statfs.f_files); } if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { - status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; + status = nfsd4_encode_fs_locations(xdr, rqstp, exp); if (status) goto out; } if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_MAXFILESIZE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64(~(u64)0); + p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes); } if (bmval0 & FATTR4_WORD0_MAXLINK) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(255); + *p++ = cpu_to_be32(255); } if (bmval0 & FATTR4_WORD0_MAXNAME) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(statfs.f_namelen); + *p++ = cpu_to_be32(statfs.f_namelen); } if (bmval0 & FATTR4_WORD0_MAXREAD) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) svc_max_payload(rqstp)); + p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); } if (bmval0 & FATTR4_WORD0_MAXWRITE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) svc_max_payload(rqstp)); + p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); } if (bmval1 & FATTR4_WORD1_MODE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(stat.mode & S_IALLUGO); + *p++ = cpu_to_be32(stat.mode & S_IALLUGO); } if (bmval1 & FATTR4_WORD1_NO_TRUNC) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval1 & FATTR4_WORD1_NUMLINKS) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(stat.nlink); + *p++ = cpu_to_be32(stat.nlink); } if (bmval1 & FATTR4_WORD1_OWNER) { - status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; + status = nfsd4_encode_user(xdr, rqstp, stat.uid); if (status) goto out; } if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { - status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; + status = nfsd4_encode_group(xdr, rqstp, stat.gid); if (status) goto out; } if (bmval1 & FATTR4_WORD1_RAWDEV) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE32((u32) MAJOR(stat.rdev)); - WRITE32((u32) MINOR(stat.rdev)); + *p++ = cpu_to_be32((u32) MAJOR(stat.rdev)); + *p++ = cpu_to_be32((u32) MINOR(stat.rdev)); } if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_SPACE_FREE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_SPACE_USED) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)stat.blocks << 9; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE64((s64)stat.atime.tv_sec); - WRITE32(stat.atime.tv_nsec); + p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec); + *p++ = cpu_to_be32(stat.atime.tv_nsec); } if (bmval1 & FATTR4_WORD1_TIME_DELTA) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE32(0); - WRITE32(1); - WRITE32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(0); } if (bmval1 & FATTR4_WORD1_TIME_METADATA) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE64((s64)stat.ctime.tv_sec); - WRITE32(stat.ctime.tv_nsec); + p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec); + *p++ = cpu_to_be32(stat.ctime.tv_nsec); } if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE64((s64)stat.mtime.tv_sec); - WRITE32(stat.mtime.tv_nsec); + p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec); + *p++ = cpu_to_be32(stat.mtime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; /* * Get parent's attributes if not ignoring crossmount @@ -2491,23 +2491,26 @@ out_acl: if (ignore_crossmnt == 0 && dentry == exp->ex_path.mnt->mnt_root) get_parent_attributes(exp, &stat); - WRITE64(stat.ino); + p = xdr_encode_hyper(p, stat.ino); } if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { - status = nfsd4_encode_security_label(rqstp, context, - contextlen, &p, &buflen); + status = nfsd4_encode_security_label(xdr, rqstp, context, + contextlen); if (status) goto out; } if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - WRITE32(3); - WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); - WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); - WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2); + p = xdr_reserve_space(xdr, 16); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD0); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD1); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD2); } - *attrlenp = htonl((char *)p - (char *)attrlenp - 4); - *buffer = p; + attrlen = htonl(xdr->buf->len - attrlen_offset - 4); + write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4); status = nfs_ok; out: @@ -2516,8 +2519,12 @@ out: security_release_secctx(context, contextlen); #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ kfree(acl); - if (fhp == &tempfh) - fh_put(&tempfh); + if (tempfh) { + fh_put(tempfh); + kfree(tempfh); + } + if (status) + xdr_truncate_encode(xdr, starting_len); return status; out_nfserr: status = nfserrno(err); @@ -2525,9 +2532,37 @@ out_nfserr: out_resource: status = nfserr_resource; goto out; -out_serverfault: - status = nfserr_serverfault; - goto out; +} + +static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr, + struct xdr_buf *buf, __be32 *p, int bytes) +{ + xdr->scratch.iov_len = 0; + memset(buf, 0, sizeof(struct xdr_buf)); + buf->head[0].iov_base = p; + buf->head[0].iov_len = 0; + buf->len = 0; + xdr->buf = buf; + xdr->iov = buf->head; + xdr->p = p; + xdr->end = (void *)p + bytes; + buf->buflen = bytes; +} + +__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, u32 *bmval, + struct svc_rqst *rqstp, int ignore_crossmnt) +{ + struct xdr_buf dummy; + struct xdr_stream xdr; + __be32 ret; + + svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2); + ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp, + ignore_crossmnt); + *p = xdr.p; + return ret; } static inline int attributes_need_mount(u32 *bmval) @@ -2540,8 +2575,8 @@ static inline int attributes_need_mount(u32 *bmval) } static __be32 -nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, - const char *name, int namlen, __be32 **p, int buflen) +nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, + const char *name, int namlen) { struct svc_export *exp = cd->rd_fhp->fh_export; struct dentry *dentry; @@ -2593,7 +2628,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, } out_encode: - nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, + nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval, cd->rd_rqstp, ignore_crossmnt); out_put: dput(dentry); @@ -2602,19 +2637,19 @@ out_put: } static __be32 * -nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr) +nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) { - __be32 *attrlenp; + __be32 *p; - if (buflen < 6) + p = xdr_reserve_space(xdr, 20); + if (!p) return NULL; *p++ = htonl(2); *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ *p++ = htonl(0); /* bmval1 */ - attrlenp = p++; + *p++ = htonl(4); /* attribute length */ *p++ = nfserr; /* no htonl */ - *attrlenp = htonl((char *)p - (char *)attrlenp - 4); return p; } @@ -2624,10 +2659,13 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, { struct readdir_cd *ccd = ccdv; struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); - int buflen; - __be32 *p = cd->buffer; - __be32 *cookiep; + struct xdr_stream *xdr = cd->xdr; + int start_offset = xdr->buf->len; + int cookie_offset; + int entry_bytes; __be32 nfserr = nfserr_toosmall; + __be64 wire_offset; + __be32 *p; /* In nfsv4, "." and ".." never make it onto the wire.. */ if (name && isdotent(name, namlen)) { @@ -2635,19 +2673,24 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, return 0; } - if (cd->offset) - xdr_encode_hyper(cd->offset, (u64) offset); + if (cd->cookie_offset) { + wire_offset = cpu_to_be64(offset); + write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset, + &wire_offset, 8); + } - buflen = cd->buflen - 4 - XDR_QUADLEN(namlen); - if (buflen < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto fail; - *p++ = xdr_one; /* mark entry present */ - cookiep = p; + cookie_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 3*4 + namlen); + if (!p) + goto fail; p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ p = xdr_encode_array(p, name, namlen); /* name length & name */ - nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, &p, buflen); + nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen); switch (nfserr) { case nfs_ok: break; @@ -2655,6 +2698,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, nfserr = nfserr_toosmall; goto fail; case nfserr_noent: + xdr_truncate_encode(xdr, start_offset); goto skip_entry; default: /* @@ -2666,59 +2710,74 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, */ if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) goto fail; - p = nfsd4_encode_rdattr_error(p, buflen, nfserr); + p = nfsd4_encode_rdattr_error(xdr, nfserr); if (p == NULL) { nfserr = nfserr_toosmall; goto fail; } } - cd->buflen -= (p - cd->buffer); - cd->buffer = p; - cd->offset = cookiep; + nfserr = nfserr_toosmall; + entry_bytes = xdr->buf->len - start_offset; + if (entry_bytes > cd->rd_maxcount) + goto fail; + cd->rd_maxcount -= entry_bytes; + if (!cd->rd_dircount) + goto fail; + cd->rd_dircount--; + cd->cookie_offset = cookie_offset; skip_entry: cd->common.err = nfs_ok; return 0; fail: + xdr_truncate_encode(xdr, start_offset); cd->common.err = nfserr; return -EINVAL; } -static void -nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) +static __be32 +nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) { __be32 *p; - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(sid->si_generation); - WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, sizeof(stateid_t)); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sid->si_generation); + p = xdr_encode_opaque_fixed(p, &sid->si_opaque, + sizeof(stateid_opaque_t)); + return 0; } static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(8); - WRITE32(access->ac_supported); - WRITE32(access->ac_resp_access); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(access->ac_supported); + *p++ = cpu_to_be32(access->ac_resp_access); } return nfserr; } static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); - WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(bcts->dir); + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(bcts->dir); /* Sorry, we do not yet support RDMA over 4.1: */ - WRITE32(0); - ADJUST_ARGS(); + *p++ = cpu_to_be32(0); } return nfserr; } @@ -2726,8 +2785,10 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &close->cl_stateid); + nfserr = nfsd4_encode_stateid(xdr, &close->cl_stateid); return nfserr; } @@ -2736,12 +2797,15 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c static __be32 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(NFS4_VERIFIER_SIZE); - WRITEMEM(commit->co_verf.data, NFS4_VERIFIER_SIZE); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, commit->co_verf.data, + NFS4_VERIFIER_SIZE); } return nfserr; } @@ -2749,15 +2813,17 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(32); - write_cinfo(&p, &create->cr_cinfo); - WRITE32(2); - WRITE32(create->cr_bmval[0]); - WRITE32(create->cr_bmval[1]); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 32); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &create->cr_cinfo); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(create->cr_bmval[0]); + *p++ = cpu_to_be32(create->cr_bmval[1]); } return nfserr; } @@ -2766,14 +2832,13 @@ static __be32 nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) { struct svc_fh *fhp = getattr->ga_fhp; - int buflen; + struct xdr_stream *xdr = &resp->xdr; if (nfserr) return nfserr; - buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2); - nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, - &resp->p, buflen, getattr->ga_bmval, + nfserr = nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, + getattr->ga_bmval, resp->rqstp, 0); return nfserr; } @@ -2781,16 +2846,17 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 static __be32 nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) { + struct xdr_stream *xdr = &resp->xdr; struct svc_fh *fhp = *fhpp; unsigned int len; __be32 *p; if (!nfserr) { len = fhp->fh_handle.fh_size; - RESERVE_SPACE(len + 4); - WRITE32(len); - WRITEMEM(&fhp->fh_handle.fh_base, len); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, len + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len); } return nfserr; } @@ -2799,35 +2865,50 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh * Including all fields other than the name, a LOCK4denied structure requires * 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes. */ -static void -nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) +static __be32 +nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld) { struct xdr_netobj *conf = &ld->ld_owner; __be32 *p; - RESERVE_SPACE(32 + XDR_LEN(conf->len)); - WRITE64(ld->ld_start); - WRITE64(ld->ld_length); - WRITE32(ld->ld_type); +again: + p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len)); + if (!p) { + /* + * Don't fail to return the result just because we can't + * return the conflicting open: + */ + if (conf->len) { + kfree(conf->data); + conf->len = 0; + conf->data = NULL; + goto again; + } + return nfserr_resource; + } + p = xdr_encode_hyper(p, ld->ld_start); + p = xdr_encode_hyper(p, ld->ld_length); + *p++ = cpu_to_be32(ld->ld_type); if (conf->len) { - WRITEMEM(&ld->ld_clientid, 8); - WRITE32(conf->len); - WRITEMEM(conf->data, conf->len); + p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8); + p = xdr_encode_opaque(p, conf->data, conf->len); kfree(conf->data); } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ - WRITE64((u64)0); /* clientid */ - WRITE32(0); /* length of owner name */ + p = xdr_encode_hyper(p, (u64)0); /* clientid */ + *p++ = cpu_to_be32(0); /* length of owner name */ } - ADJUST_ARGS(); + return nfserr_denied; } static __be32 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); + nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); else if (nfserr == nfserr_denied) - nfsd4_encode_lock_denied(resp, &lock->lk_denied); + nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied); return nfserr; } @@ -2835,16 +2916,20 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo static __be32 nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) { + struct xdr_stream *xdr = &resp->xdr; + if (nfserr == nfserr_denied) - nfsd4_encode_lock_denied(resp, &lockt->lt_denied); + nfsd4_encode_lock_denied(xdr, &lockt->lt_denied); return nfserr; } static __be32 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &locku->lu_stateid); + nfserr = nfsd4_encode_stateid(xdr, &locku->lu_stateid); return nfserr; } @@ -2853,12 +2938,14 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(20); - write_cinfo(&p, &link->li_cinfo); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &link->li_cinfo); } return nfserr; } @@ -2867,72 +2954,86 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (nfserr) goto out; - nfsd4_encode_stateid(resp, &open->op_stateid); - RESERVE_SPACE(40); - write_cinfo(&p, &open->op_cinfo); - WRITE32(open->op_rflags); - WRITE32(2); - WRITE32(open->op_bmval[0]); - WRITE32(open->op_bmval[1]); - WRITE32(open->op_delegate_type); - ADJUST_ARGS(); + nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); + if (nfserr) + goto out; + p = xdr_reserve_space(xdr, 40); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &open->op_cinfo); + *p++ = cpu_to_be32(open->op_rflags); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(open->op_bmval[0]); + *p++ = cpu_to_be32(open->op_bmval[1]); + *p++ = cpu_to_be32(open->op_delegate_type); switch (open->op_delegate_type) { case NFS4_OPEN_DELEGATE_NONE: break; case NFS4_OPEN_DELEGATE_READ: - nfsd4_encode_stateid(resp, &open->op_delegate_stateid); - RESERVE_SPACE(20); - WRITE32(open->op_recall); + nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_recall); /* * TODO: ACE's in delegations */ - WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - WRITE32(0); - WRITE32(0); - WRITE32(0); /* XXX: is NULL principal ok? */ - ADJUST_ARGS(); + *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ break; case NFS4_OPEN_DELEGATE_WRITE: - nfsd4_encode_stateid(resp, &open->op_delegate_stateid); - RESERVE_SPACE(32); - WRITE32(0); + nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 32); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* * TODO: space_limit's in delegations */ - WRITE32(NFS4_LIMIT_SIZE); - WRITE32(~(u32)0); - WRITE32(~(u32)0); + *p++ = cpu_to_be32(NFS4_LIMIT_SIZE); + *p++ = cpu_to_be32(~(u32)0); + *p++ = cpu_to_be32(~(u32)0); /* * TODO: ACE's in delegations */ - WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - WRITE32(0); - WRITE32(0); - WRITE32(0); /* XXX: is NULL principal ok? */ - ADJUST_ARGS(); + *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ break; case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */ switch (open->op_why_no_deleg) { case WND4_CONTENTION: case WND4_RESOURCE: - RESERVE_SPACE(8); - WRITE32(open->op_why_no_deleg); - WRITE32(0); /* deleg signaling not supported yet */ + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_why_no_deleg); + /* deleg signaling not supported yet: */ + *p++ = cpu_to_be32(0); break; default: - RESERVE_SPACE(4); - WRITE32(open->op_why_no_deleg); + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_why_no_deleg); } - ADJUST_ARGS(); break; default: BUG(); @@ -2945,8 +3046,10 @@ out: static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); + nfserr = nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); return nfserr; } @@ -2954,127 +3057,233 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &od->od_stateid); + nfserr = nfsd4_encode_stateid(xdr, &od->od_stateid); return nfserr; } -static __be32 -nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) +static __be32 nfsd4_encode_splice_read( + struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + struct file *file, unsigned long maxcount) { + struct xdr_stream *xdr = &resp->xdr; + struct xdr_buf *buf = xdr->buf; u32 eof; - int v; - struct page *page; - unsigned long maxcount; - long len; - __be32 *p; + int space_left; + __be32 nfserr; + __be32 *p = xdr->p - 2; - if (nfserr) - return nfserr; - if (resp->xbuf->page_len) + /* + * Don't inline pages unless we know there's room for eof, + * count, and possible padding: + */ + if (xdr->end - xdr->p < 3) return nfserr_resource; - RESERVE_SPACE(8); /* eof flag and byte count */ + nfserr = nfsd_splice_read(read->rd_rqstp, file, + read->rd_offset, &maxcount); + if (nfserr) { + /* + * nfsd_splice_actor may have already messed with the + * page length; reset it so as not to confuse + * xdr_truncate_encode: + */ + buf->page_len = 0; + return nfserr; + } + + eof = (read->rd_offset + maxcount >= + read->rd_fhp->fh_dentry->d_inode->i_size); - maxcount = svc_max_payload(resp->rqstp); - if (maxcount > read->rd_length) - maxcount = read->rd_length; + *(p++) = htonl(eof); + *(p++) = htonl(maxcount); + + buf->page_len = maxcount; + buf->len += maxcount; + xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE; + + /* Use rest of head for padding and remaining ops: */ + buf->tail[0].iov_base = xdr->p; + buf->tail[0].iov_len = 0; + xdr->iov = buf->tail; + if (maxcount&3) { + int pad = 4 - (maxcount&3); + + *(xdr->p++) = 0; + + buf->tail[0].iov_base += maxcount&3; + buf->tail[0].iov_len = pad; + buf->len += pad; + } + + space_left = min_t(int, (void *)xdr->end - (void *)xdr->p, + buf->buflen - buf->len); + buf->buflen = buf->len + space_left; + xdr->end = (__be32 *)((void *)xdr->end + space_left); + + return 0; +} + +static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + struct file *file, unsigned long maxcount) +{ + struct xdr_stream *xdr = &resp->xdr; + u32 eof; + int v; + int starting_len = xdr->buf->len - 8; + long len; + int thislen; + __be32 nfserr; + __be32 tmp; + __be32 *p; + u32 zzz = 0; + int pad; len = maxcount; v = 0; - while (len > 0) { - page = *(resp->rqstp->rq_next_page); - if (!page) { /* ran out of pages */ - maxcount -= len; - break; - } - resp->rqstp->rq_vec[v].iov_base = page_address(page); - resp->rqstp->rq_vec[v].iov_len = - len < PAGE_SIZE ? len : PAGE_SIZE; - resp->rqstp->rq_next_page++; + + thislen = (void *)xdr->end - (void *)xdr->p; + if (len < thislen) + thislen = len; + p = xdr_reserve_space(xdr, (thislen+3)&~3); + WARN_ON_ONCE(!p); + resp->rqstp->rq_vec[v].iov_base = p; + resp->rqstp->rq_vec[v].iov_len = thislen; + v++; + len -= thislen; + + while (len) { + thislen = min_t(long, len, PAGE_SIZE); + p = xdr_reserve_space(xdr, (thislen+3)&~3); + WARN_ON_ONCE(!p); + resp->rqstp->rq_vec[v].iov_base = p; + resp->rqstp->rq_vec[v].iov_len = thislen; v++; - len -= PAGE_SIZE; + len -= thislen; } read->rd_vlen = v; - nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp, - read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, - &maxcount); - + nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec, + read->rd_vlen, &maxcount); if (nfserr) return nfserr; + xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); + eof = (read->rd_offset + maxcount >= read->rd_fhp->fh_dentry->d_inode->i_size); - WRITE32(eof); - WRITE32(maxcount); - ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = (char*)p - - (char*)resp->xbuf->head[0].iov_base; - resp->xbuf->page_len = maxcount; + tmp = htonl(eof); + write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); + tmp = htonl(maxcount); + write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); - /* Use rest of head for padding and remaining ops: */ - resp->xbuf->tail[0].iov_base = p; - resp->xbuf->tail[0].iov_len = 0; - if (maxcount&3) { - RESERVE_SPACE(4); - WRITE32(0); - resp->xbuf->tail[0].iov_base += maxcount&3; - resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); - ADJUST_ARGS(); - } + pad = (maxcount&3) ? 4 - (maxcount&3) : 0; + write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount, + &zzz, pad); return 0; + } static __be32 -nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_read *read) { - int maxcount; - char *page; + unsigned long maxcount; + struct xdr_stream *xdr = &resp->xdr; + struct file *file = read->rd_filp; + int starting_len = xdr->buf->len; + struct raparms *ra; __be32 *p; + __be32 err; if (nfserr) return nfserr; - if (resp->xbuf->page_len) + + p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ + if (!p) { + WARN_ON_ONCE(resp->rqstp->rq_splice_ok); return nfserr_resource; - if (!*resp->rqstp->rq_next_page) + } + if (resp->xdr.buf->page_len && resp->rqstp->rq_splice_ok) { + WARN_ON_ONCE(1); return nfserr_resource; + } + xdr_commit_encode(xdr); + + maxcount = svc_max_payload(resp->rqstp); + if (maxcount > xdr->buf->buflen - xdr->buf->len) + maxcount = xdr->buf->buflen - xdr->buf->len; + if (maxcount > read->rd_length) + maxcount = read->rd_length; + + if (!read->rd_filp) { + err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, + &file, &ra); + if (err) + goto err_truncate; + } + + if (file->f_op->splice_read && resp->rqstp->rq_splice_ok) + err = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + err = nfsd4_encode_readv(resp, read, file, maxcount); + + if (!read->rd_filp) + nfsd_put_tmp_read_open(file, ra); + +err_truncate: + if (err) + xdr_truncate_encode(xdr, starting_len); + return err; +} - page = page_address(*(resp->rqstp->rq_next_page++)); +static __be32 +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +{ + int maxcount; + __be32 wire_count; + int zero = 0; + struct xdr_stream *xdr = &resp->xdr; + int length_offset = xdr->buf->len; + __be32 *p; + + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; maxcount = PAGE_SIZE; - RESERVE_SPACE(4); + p = xdr_reserve_space(xdr, maxcount); + if (!p) + return nfserr_resource; /* * XXX: By default, the ->readlink() VFS op will truncate symlinks * if they would overflow the buffer. Is this kosher in NFSv4? If * not, one easy fix is: if ->readlink() precisely fills the buffer, * assume that truncation occurred, and return NFS4ERR_RESOURCE. */ - nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, page, &maxcount); + nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, + (char *)p, &maxcount); if (nfserr == nfserr_isdir) - return nfserr_inval; - if (nfserr) + nfserr = nfserr_inval; + if (nfserr) { + xdr_truncate_encode(xdr, length_offset); return nfserr; - - WRITE32(maxcount); - ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = (char*)p - - (char*)resp->xbuf->head[0].iov_base; - resp->xbuf->page_len = maxcount; - - /* Use rest of head for padding and remaining ops: */ - resp->xbuf->tail[0].iov_base = p; - resp->xbuf->tail[0].iov_len = 0; - if (maxcount&3) { - RESERVE_SPACE(4); - WRITE32(0); - resp->xbuf->tail[0].iov_base += maxcount&3; - resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); - ADJUST_ARGS(); } + + wire_count = htonl(maxcount); + write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); + xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4)); + if (maxcount & 3) + write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, + &zero, 4 - (maxcount&3)); return 0; } @@ -3082,47 +3291,52 @@ static __be32 nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) { int maxcount; + int bytes_left; loff_t offset; - __be32 *page, *savep, *tailbase; + __be64 wire_offset; + struct xdr_stream *xdr = &resp->xdr; + int starting_len = xdr->buf->len; __be32 *p; if (nfserr) return nfserr; - if (resp->xbuf->page_len) - return nfserr_resource; - if (!*resp->rqstp->rq_next_page) - return nfserr_resource; - RESERVE_SPACE(NFS4_VERIFIER_SIZE); - savep = p; + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ - WRITE32(0); - WRITE32(0); - ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; - tailbase = p; - - maxcount = PAGE_SIZE; - if (maxcount > readdir->rd_maxcount) - maxcount = readdir->rd_maxcount; + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p) + - (char *)resp->xdr.buf->head[0].iov_base; /* - * Convert from bytes to words, account for the two words already - * written, make sure to leave two words at the end for the next - * pointer and eof field. + * Number of bytes left for directory entries allowing for the + * final 8 bytes of the readdir and a following failed op: */ - maxcount = (maxcount >> 2) - 4; - if (maxcount < 0) { - nfserr = nfserr_toosmall; + bytes_left = xdr->buf->buflen - xdr->buf->len + - COMPOUND_ERR_SLACK_SPACE - 8; + if (bytes_left < 0) { + nfserr = nfserr_resource; + goto err_no_verf; + } + maxcount = min_t(u32, readdir->rd_maxcount, INT_MAX); + /* + * Note the rfc defines rd_maxcount as the size of the + * READDIR4resok structure, which includes the verifier above + * and the 8 bytes encoded at the end of this function: + */ + if (maxcount < 16) { + nfserr = nfserr_toosmall; goto err_no_verf; } + maxcount = min_t(int, maxcount-16, bytes_left); - page = page_address(*(resp->rqstp->rq_next_page++)); + readdir->xdr = xdr; + readdir->rd_maxcount = maxcount; readdir->common.err = 0; - readdir->buflen = maxcount; - readdir->buffer = page; - readdir->offset = NULL; + readdir->cookie_offset = 0; offset = readdir->rd_cookie; nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, @@ -3130,42 +3344,49 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 &readdir->common, nfsd4_encode_dirent); if (nfserr == nfs_ok && readdir->common.err == nfserr_toosmall && - readdir->buffer == page) - nfserr = nfserr_toosmall; + xdr->buf->len == starting_len + 8) { + /* nothing encoded; which limit did we hit?: */ + if (maxcount - 16 < bytes_left) + /* It was the fault of rd_maxcount: */ + nfserr = nfserr_toosmall; + else + /* We ran out of buffer space: */ + nfserr = nfserr_resource; + } if (nfserr) goto err_no_verf; - if (readdir->offset) - xdr_encode_hyper(readdir->offset, offset); + if (readdir->cookie_offset) { + wire_offset = cpu_to_be64(offset); + write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, + &wire_offset, 8); + } - p = readdir->buffer; + p = xdr_reserve_space(xdr, 8); + if (!p) { + WARN_ON_ONCE(1); + goto err_no_verf; + } *p++ = 0; /* no more entries */ *p++ = htonl(readdir->common.err == nfserr_eof); - resp->xbuf->page_len = ((char*)p) - - (char*)page_address(*(resp->rqstp->rq_next_page-1)); - - /* Use rest of head for padding and remaining ops: */ - resp->xbuf->tail[0].iov_base = tailbase; - resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; return 0; err_no_verf: - p = savep; - ADJUST_ARGS(); + xdr_truncate_encode(xdr, starting_len); return nfserr; } static __be32 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(20); - write_cinfo(&p, &remove->rm_cinfo); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &remove->rm_cinfo); } return nfserr; } @@ -3173,19 +3394,21 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(40); - write_cinfo(&p, &rename->rn_sinfo); - write_cinfo(&p, &rename->rn_tinfo); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 40); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &rename->rn_sinfo); + p = encode_cinfo(p, &rename->rn_tinfo); } return nfserr; } static __be32 -nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, +nfsd4_do_encode_secinfo(struct xdr_stream *xdr, __be32 nfserr, struct svc_export *exp) { u32 i, nflavs, supported; @@ -3196,6 +3419,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (nfserr) goto out; + nfserr = nfserr_resource; if (exp->ex_nflavors) { flavs = exp->ex_flavors; nflavs = exp->ex_nflavors; @@ -3217,9 +3441,10 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, } supported = 0; - RESERVE_SPACE(4); + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; flavorsp = p++; /* to be backfilled later */ - ADJUST_ARGS(); for (i = 0; i < nflavs; i++) { rpc_authflavor_t pf = flavs[i].pseudoflavor; @@ -3227,18 +3452,20 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (rpcauth_get_gssinfo(pf, &info) == 0) { supported++; - RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4); - WRITE32(RPC_AUTH_GSS); - WRITE32(info.oid.len); - WRITEMEM(info.oid.data, info.oid.len); - WRITE32(info.qop); - WRITE32(info.service); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4 + 4 + + XDR_LEN(info.oid.len) + 4 + 4); + if (!p) + goto out; + *p++ = cpu_to_be32(RPC_AUTH_GSS); + p = xdr_encode_opaque(p, info.oid.data, info.oid.len); + *p++ = cpu_to_be32(info.qop); + *p++ = cpu_to_be32(info.service); } else if (pf < RPC_AUTH_MAXFLAVOR) { supported++; - RESERVE_SPACE(4); - WRITE32(pf); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + *p++ = cpu_to_be32(pf); } else { if (report) pr_warn("NFS: SECINFO: security flavor %u " @@ -3249,7 +3476,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (nflavs != supported) report = false; *flavorsp = htonl(supported); - + nfserr = 0; out: if (exp) exp_put(exp); @@ -3260,14 +3487,18 @@ static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo *secinfo) { - return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp); + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->si_exp); } static __be32 nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo_no_name *secinfo) { - return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp); + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->sin_exp); } /* @@ -3277,41 +3508,47 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; - RESERVE_SPACE(16); + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; if (nfserr) { - WRITE32(3); - WRITE32(0); - WRITE32(0); - WRITE32(0); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); } else { - WRITE32(3); - WRITE32(setattr->sa_bmval[0]); - WRITE32(setattr->sa_bmval[1]); - WRITE32(setattr->sa_bmval[2]); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(setattr->sa_bmval[0]); + *p++ = cpu_to_be32(setattr->sa_bmval[1]); + *p++ = cpu_to_be32(setattr->sa_bmval[2]); } - ADJUST_ARGS(); return nfserr; } static __be32 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(8 + NFS4_VERIFIER_SIZE); - WRITEMEM(&scd->se_clientid, 8); - WRITEMEM(&scd->se_confirm, NFS4_VERIFIER_SIZE); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8); + p = xdr_encode_opaque_fixed(p, &scd->se_confirm, + NFS4_VERIFIER_SIZE); } else if (nfserr == nfserr_clid_inuse) { - RESERVE_SPACE(8); - WRITE32(0); - WRITE32(0); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); } return nfserr; } @@ -3319,14 +3556,17 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n static __be32 nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(16); - WRITE32(write->wr_bytes_written); - WRITE32(write->wr_how_written); - WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(write->wr_bytes_written); + *p++ = cpu_to_be32(write->wr_how_written); + p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, + NFS4_VERIFIER_SIZE); } return nfserr; } @@ -3343,6 +3583,7 @@ static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_exchange_id *exid) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; char *major_id; char *server_scope; @@ -3358,52 +3599,61 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, server_scope = utsname()->nodename; server_scope_sz = strlen(server_scope); - RESERVE_SPACE( + p = xdr_reserve_space(xdr, 8 /* eir_clientid */ + 4 /* eir_sequenceid */ + 4 /* eir_flags */ + - 4 /* spr_how */ + - 8 /* spo_must_enforce, spo_must_allow */ + - 8 /* so_minor_id */ + - 4 /* so_major_id.len */ + - (XDR_QUADLEN(major_id_sz) * 4) + - 4 /* eir_server_scope.len */ + - (XDR_QUADLEN(server_scope_sz) * 4) + - 4 /* eir_server_impl_id.count (0) */); + 4 /* spr_how */); + if (!p) + return nfserr_resource; + + p = xdr_encode_opaque_fixed(p, &exid->clientid, 8); + *p++ = cpu_to_be32(exid->seqid); + *p++ = cpu_to_be32(exid->flags); - WRITEMEM(&exid->clientid, 8); - WRITE32(exid->seqid); - WRITE32(exid->flags); + *p++ = cpu_to_be32(exid->spa_how); - WRITE32(exid->spa_how); switch (exid->spa_how) { case SP4_NONE: break; case SP4_MACH_CRED: + /* spo_must_enforce, spo_must_allow */ + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; + /* spo_must_enforce bitmap: */ - WRITE32(2); - WRITE32(nfs4_minimal_spo_must_enforce[0]); - WRITE32(nfs4_minimal_spo_must_enforce[1]); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]); + *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]); /* empty spo_must_allow bitmap: */ - WRITE32(0); + *p++ = cpu_to_be32(0); + break; default: WARN_ON_ONCE(1); } + p = xdr_reserve_space(xdr, + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* eir_server_scope.len */ + + (XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + if (!p) + return nfserr_resource; + /* The server_owner struct */ - WRITE64(minor_id); /* Minor id */ + p = xdr_encode_hyper(p, minor_id); /* Minor id */ /* major id */ - WRITE32(major_id_sz); - WRITEMEM(major_id, major_id_sz); + p = xdr_encode_opaque(p, major_id, major_id_sz); /* Server scope */ - WRITE32(server_scope_sz); - WRITEMEM(server_scope, server_scope_sz); + p = xdr_encode_opaque(p, server_scope, server_scope_sz); /* Implementation id */ - WRITE32(0); /* zero length nfs_impl_id4 array */ - ADJUST_ARGS(); + *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */ return 0; } @@ -3411,93 +3661,81 @@ static __be32 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create_session *sess) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (nfserr) return nfserr; - RESERVE_SPACE(24); - WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(sess->seqid); - WRITE32(sess->flags); - ADJUST_ARGS(); - - RESERVE_SPACE(28); - WRITE32(0); /* headerpadsz */ - WRITE32(sess->fore_channel.maxreq_sz); - WRITE32(sess->fore_channel.maxresp_sz); - WRITE32(sess->fore_channel.maxresp_cached); - WRITE32(sess->fore_channel.maxops); - WRITE32(sess->fore_channel.maxreqs); - WRITE32(sess->fore_channel.nr_rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 24); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, sess->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(sess->seqid); + *p++ = cpu_to_be32(sess->flags); + + p = xdr_reserve_space(xdr, 28); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* headerpadsz */ + *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz); + *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz); + *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached); + *p++ = cpu_to_be32(sess->fore_channel.maxops); + *p++ = cpu_to_be32(sess->fore_channel.maxreqs); + *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs); if (sess->fore_channel.nr_rdma_attrs) { - RESERVE_SPACE(4); - WRITE32(sess->fore_channel.rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs); } - RESERVE_SPACE(28); - WRITE32(0); /* headerpadsz */ - WRITE32(sess->back_channel.maxreq_sz); - WRITE32(sess->back_channel.maxresp_sz); - WRITE32(sess->back_channel.maxresp_cached); - WRITE32(sess->back_channel.maxops); - WRITE32(sess->back_channel.maxreqs); - WRITE32(sess->back_channel.nr_rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 28); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* headerpadsz */ + *p++ = cpu_to_be32(sess->back_channel.maxreq_sz); + *p++ = cpu_to_be32(sess->back_channel.maxresp_sz); + *p++ = cpu_to_be32(sess->back_channel.maxresp_cached); + *p++ = cpu_to_be32(sess->back_channel.maxops); + *p++ = cpu_to_be32(sess->back_channel.maxreqs); + *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs); if (sess->back_channel.nr_rdma_attrs) { - RESERVE_SPACE(4); - WRITE32(sess->back_channel.rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sess->back_channel.rdma_attrs); } return 0; } static __be32 -nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_destroy_session *destroy_session) -{ - return nfserr; -} - -static __be32 -nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_free_stateid *free_stateid) -{ - __be32 *p; - - if (nfserr) - return nfserr; - - RESERVE_SPACE(4); - *p++ = nfserr; - ADJUST_ARGS(); - return nfserr; -} - -static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_sequence *seq) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (nfserr) return nfserr; - RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20); - WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(seq->seqid); - WRITE32(seq->slotid); + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, seq->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(seq->seqid); + *p++ = cpu_to_be32(seq->slotid); /* Note slotid's are numbered from zero: */ - WRITE32(seq->maxslots - 1); /* sr_highest_slotid */ - WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */ - WRITE32(seq->status_flags); + *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */ + *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */ + *p++ = cpu_to_be32(seq->status_flags); - ADJUST_ARGS(); - resp->cstate.datap = p; /* DRC cache data pointer */ + resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */ return 0; } @@ -3505,17 +3743,22 @@ static __be32 nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_test_stateid *test_stateid) { + struct xdr_stream *xdr = &resp->xdr; struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; - RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids)); + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids)); + if (!p) + return nfserr_resource; *p++ = htonl(test_stateid->ts_num_ids); list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) { *p++ = stateid->ts_id_status; } - ADJUST_ARGS(); return nfserr; } @@ -3576,8 +3819,8 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_free_stateid, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, @@ -3594,83 +3837,99 @@ static nfsd4_enc nfsd4_enc_ops[] = { }; /* - * Calculate the total amount of memory that the compound response has taken - * after encoding the current operation with pad. + * Calculate whether we still have space to encode repsize bytes. + * There are two considerations: + * - For NFS versions >=4.1, the size of the reply must stay within + * session limits + * - For all NFS versions, we must stay within limited preallocated + * buffer space. * - * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop() - * which was specified at nfsd4_operation, else pad is zero. - * - * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached. - * - * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so - * will be at least a page and will therefore hold the xdr_buf head. + * This is called before the operation is processed, so can only provide + * an upper estimate. For some nonidempotent operations (such as + * getattr), it's not necessarily a problem if that estimate is wrong, + * as we can fail it after processing without significant side effects. */ -__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) +__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize) { - struct xdr_buf *xb = &resp->rqstp->rq_res; - struct nfsd4_session *session = NULL; + struct xdr_buf *buf = &resp->rqstp->rq_res; struct nfsd4_slot *slot = resp->cstate.slot; - u32 length, tlen = 0; + if (buf->len + respsize <= buf->buflen) + return nfs_ok; if (!nfsd4_has_session(&resp->cstate)) - return 0; - - session = resp->cstate.session; - if (session == NULL) - return 0; - - if (xb->page_len == 0) { - length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; - } else { - if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0) - tlen = (char *)resp->p - (char *)xb->tail[0].iov_base; - - length = xb->head[0].iov_len + xb->page_len + tlen + pad; - } - dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, - length, xb->page_len, tlen, pad); - - if (length > session->se_fchannel.maxresp_sz) - return nfserr_rep_too_big; - - if ((slot->sl_flags & NFSD4_SLOT_CACHETHIS) && - length > session->se_fchannel.maxresp_cached) + return nfserr_resource; + if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) { + WARN_ON_ONCE(1); return nfserr_rep_too_big_to_cache; - - return 0; + } + return nfserr_rep_too_big; } void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { + struct xdr_stream *xdr = &resp->xdr; struct nfs4_stateowner *so = resp->cstate.replay_owner; - __be32 *statp; + struct svc_rqst *rqstp = resp->rqstp; + int post_err_offset; + nfsd4_enc encoder; __be32 *p; - RESERVE_SPACE(8); - WRITE32(op->opnum); - statp = p++; /* to be backfilled at the end */ - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8); + if (!p) { + WARN_ON_ONCE(1); + return; + } + *p++ = cpu_to_be32(op->opnum); + post_err_offset = xdr->buf->len; if (op->opnum == OP_ILLEGAL) goto status; BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || !nfsd4_enc_ops[op->opnum]); - op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); - /* nfsd4_check_drc_limit guarantees enough room for error status */ - if (!op->status) - op->status = nfsd4_check_resp_size(resp, 0); + encoder = nfsd4_enc_ops[op->opnum]; + op->status = encoder(resp, op->status, &op->u); + xdr_commit_encode(xdr); + + /* nfsd4_check_resp_size guarantees enough room for error status */ + if (!op->status) { + int space_needed = 0; + if (!nfsd4_last_compound_op(rqstp)) + space_needed = COMPOUND_ERR_SLACK_SPACE; + op->status = nfsd4_check_resp_size(resp, space_needed); + } + if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) { + struct nfsd4_slot *slot = resp->cstate.slot; + + if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) + op->status = nfserr_rep_too_big_to_cache; + else + op->status = nfserr_rep_too_big; + } + if (op->status == nfserr_resource || + op->status == nfserr_rep_too_big || + op->status == nfserr_rep_too_big_to_cache) { + /* + * The operation may have already been encoded or + * partially encoded. No op returns anything additional + * in the case of one of these three errors, so we can + * just truncate back to after the status. But it's a + * bug if we had to do this on a non-idempotent op: + */ + warn_on_nonidempotent_op(op); + xdr_truncate_encode(xdr, post_err_offset); + } if (so) { + int len = xdr->buf->len - post_err_offset; + so->so_replay.rp_status = op->status; - so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1); - memcpy(so->so_replay.rp_buf, statp+1, so->so_replay.rp_buflen); + so->so_replay.rp_buflen = len; + read_bytes_from_xdr_buf(xdr->buf, post_err_offset, + so->so_replay.rp_buf, len); } status: - /* - * Note: We write the status directly, instead of using WRITE32(), - * since it is already in network byte order. - */ - *statp = op->status; + /* Note that op->status is already in network byte order: */ + write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4); } /* @@ -3682,21 +3941,22 @@ status: * called with nfs4_lock_state() held */ void -nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op) +nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) { __be32 *p; struct nfs4_replay *rp = op->replay; BUG_ON(!rp); - RESERVE_SPACE(8); - WRITE32(op->opnum); + p = xdr_reserve_space(xdr, 8 + rp->rp_buflen); + if (!p) { + WARN_ON_ONCE(1); + return; + } + *p++ = cpu_to_be32(op->opnum); *p++ = rp->rp_status; /* already xdr'ed */ - ADJUST_ARGS(); - RESERVE_SPACE(rp->rp_buflen); - WRITEMEM(rp->rp_buf, rp->rp_buflen); - ADJUST_ARGS(); + p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen); } int @@ -3728,6 +3988,12 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp) int nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args) { + if (rqstp->rq_arg.head[0].iov_len % 4) { + /* client is nuts */ + dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)", + __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid)); + return 0; + } args->p = p; args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; args->pagelist = rqstp->rq_arg.pages; @@ -3747,19 +4013,19 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo * All that remains is to write the tag and operation count... */ struct nfsd4_compound_state *cs = &resp->cstate; - struct kvec *iov; + struct xdr_buf *buf = resp->xdr.buf; + + WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + + buf->tail[0].iov_len); + + rqstp->rq_next_page = resp->xdr.page_ptr + 1; + p = resp->tagp; *p++ = htonl(resp->taglen); memcpy(p, resp->tag, resp->taglen); p += XDR_QUADLEN(resp->taglen); *p++ = htonl(resp->opcnt); - if (rqstp->rq_res.page_len) - iov = &rqstp->rq_res.tail[0]; - else - iov = &rqstp->rq_res.head[0]; - iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; - BUG_ON(iov->iov_len > PAGE_SIZE); if (nfsd4_has_session(cs)) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfs4_client *clp = cs->session->se_client; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 9186c7ce0b1..6040da8830f 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -224,13 +224,6 @@ hash_refile(struct svc_cacherep *rp) hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits)); } -static inline bool -nfsd_cache_entry_expired(struct svc_cacherep *rp) -{ - return rp->c_state != RC_INPROG && - time_after(jiffies, rp->c_timestamp + RC_EXPIRE); -} - /* * Walk the LRU list and prune off entries that are older than RC_EXPIRE. * Also prune the oldest ones when the total exceeds the max number of entries. @@ -242,8 +235,14 @@ prune_cache_entries(void) long freed = 0; list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { - if (!nfsd_cache_entry_expired(rp) && - num_drc_entries <= max_drc_entries) + /* + * Don't free entries attached to calls that are still + * in-progress, but do keep scanning the list. + */ + if (rp->c_state == RC_INPROG) + continue; + if (num_drc_entries <= max_drc_entries && + time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) break; nfsd_reply_cache_free_locked(rp); freed++; @@ -409,22 +408,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) /* * Since the common case is a cache miss followed by an insert, - * preallocate an entry. First, try to reuse the first entry on the LRU - * if it works, then go ahead and prune the LRU list. + * preallocate an entry. */ - spin_lock(&cache_lock); - if (!list_empty(&lru_head)) { - rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); - if (nfsd_cache_entry_expired(rp) || - num_drc_entries >= max_drc_entries) { - lru_put_end(rp); - prune_cache_entries(); - goto search_cache; - } - } - - /* No expired ones available, allocate a new one. */ - spin_unlock(&cache_lock); rp = nfsd_reply_cache_alloc(); spin_lock(&cache_lock); if (likely(rp)) { @@ -432,7 +417,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) drc_mem_usage += sizeof(*rp); } -search_cache: + /* go ahead and prune the cache */ + prune_cache_entries(); + found = nfsd_cache_search(rqstp, csum); if (found) { if (likely(rp)) @@ -446,15 +433,6 @@ search_cache: goto out; } - /* - * We're keeping the one we just allocated. Are we now over the - * limit? Prune one off the tip of the LRU in trade for the one we - * just allocated if so. - */ - if (num_drc_entries >= max_drc_entries) - nfsd_reply_cache_free_locked(list_first_entry(&lru_head, - struct svc_cacherep, c_lru)); - nfsdstats.rcmisses++; rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7f555179bf8..51844048937 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -699,6 +699,11 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net) if (err != 0 || fd < 0) return -EINVAL; + if (svc_alien_sock(net, fd)) { + printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__); + return -EINVAL; + } + err = nfsd_create_serv(net); if (err != 0) return err; @@ -1174,7 +1179,6 @@ static int __init init_nfsd(void) retval = nfsd4_init_slabs(); if (retval) goto out_unregister_pernet; - nfs4_state_init(); retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ if (retval) goto out_free_slabs; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 30f34ab0213..847daf37e56 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -15,11 +15,20 @@ #include <linux/nfs2.h> #include <linux/nfs3.h> #include <linux/nfs4.h> +#include <linux/sunrpc/svc.h> #include <linux/sunrpc/msg_prot.h> -#include <linux/nfsd/debug.h> -#include <linux/nfsd/export.h> -#include <linux/nfsd/stats.h> +#include <uapi/linux/nfsd/debug.h> + +#include "stats.h" +#include "export.h" + +#undef ifdebug +#ifdef NFSD_DEBUG +# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) +#else +# define ifdebug(flag) if (0) +#endif /* * nfsd version @@ -106,7 +115,6 @@ static inline int nfsd_v4client(struct svc_rqst *rq) */ #ifdef CONFIG_NFSD_V4 extern unsigned long max_delegations; -void nfs4_state_init(void); int nfsd4_init_slabs(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); @@ -117,7 +125,6 @@ void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); char * nfs4_recoverydir(void); #else -static inline void nfs4_state_init(void) { } static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } static inline int nfs4_state_start(void) { return 0; } @@ -282,7 +289,7 @@ void nfsd_lockd_shutdown(void); * reason. */ #define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ -#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ +#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */ #define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 3d0e15ae6f7..ec839341815 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -88,9 +88,8 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, /* Check if the request originated from a secure port. */ if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - dprintk(KERN_WARNING - "nfsd: request from insecure port %s!\n", - svc_print_addr(rqstp, buf, sizeof(buf))); + dprintk("nfsd: request from insecure port %s!\n", + svc_print_addr(rqstp, buf, sizeof(buf))); return nfserr_perm; } @@ -169,8 +168,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) data_left -= len; if (data_left < 0) return error; - exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); - fid = (struct fid *)(fh->fh_auth + len); + exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid); + fid = (struct fid *)(fh->fh_fsid + len); } else { __u32 tfh[2]; dev_t xdev; @@ -385,7 +384,7 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, { if (dentry != exp->ex_path.dentry) { struct fid *fid = (struct fid *) - (fhp->fh_handle.fh_auth + fhp->fh_handle.fh_size/4 - 1); + (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1); int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4; int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK); @@ -513,7 +512,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, */ struct inode * inode = dentry->d_inode; - __u32 *datap; dev_t ex_dev = exp_sb(exp)->s_dev; dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n", @@ -557,17 +555,16 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, if (inode) _fh_update_old(dentry, exp, &fhp->fh_handle); } else { - int len; + fhp->fh_handle.fh_size = + key_len(fhp->fh_handle.fh_fsid_type) + 4; fhp->fh_handle.fh_auth_type = 0; - datap = fhp->fh_handle.fh_auth+0; - mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev, + + mk_fsid(fhp->fh_handle.fh_fsid_type, + fhp->fh_handle.fh_fsid, + ex_dev, exp->ex_path.dentry->d_inode->i_ino, exp->ex_fsid, exp->ex_uuid); - len = key_len(fhp->fh_handle.fh_fsid_type); - datap += len/4; - fhp->fh_handle.fh_size = 4 + len; - if (inode) _fh_update(fhp, exp, dentry); if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) { @@ -598,22 +595,20 @@ fh_update(struct svc_fh *fhp) _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle); } else { if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT) - goto out; + return 0; _fh_update(fhp, fhp->fh_export, dentry); if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) return nfserr_opnotsupp; } -out: return 0; - out_bad: printk(KERN_ERR "fh_update: fh not verified!\n"); - goto out; + return nfserr_serverfault; out_negative: printk(KERN_ERR "fh_update: %pd2 still negative!\n", dentry); - goto out; + return nfserr_serverfault; } /* diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 4775bc4896c..2e89e70ac15 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -1,9 +1,58 @@ -/* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ +/* + * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> + * + * This file describes the layout of the file handles as passed + * over the wire. + */ +#ifndef _LINUX_NFSD_NFSFH_H +#define _LINUX_NFSD_NFSFH_H + +#include <linux/sunrpc/svc.h> +#include <uapi/linux/nfsd/nfsfh.h> + +static inline __u32 ino_t_to_u32(ino_t ino) +{ + return (__u32) ino; +} -#ifndef _LINUX_NFSD_FH_INT_H -#define _LINUX_NFSD_FH_INT_H +static inline ino_t u32_to_ino_t(__u32 uino) +{ + return (ino_t) uino; +} + +/* + * This is the internal representation of an NFS handle used in knfsd. + * pre_mtime/post_version will be used to support wcc_attr's in NFSv3. + */ +typedef struct svc_fh { + struct knfsd_fh fh_handle; /* FH data */ + struct dentry * fh_dentry; /* validated dentry */ + struct svc_export * fh_export; /* export pointer */ + int fh_maxsize; /* max size for fh_handle */ -#include <linux/nfsd/nfsfh.h> + unsigned char fh_locked; /* inode locked by us */ + unsigned char fh_want_write; /* remount protection taken */ + +#ifdef CONFIG_NFSD_V3 + unsigned char fh_post_saved; /* post-op attrs saved */ + unsigned char fh_pre_saved; /* pre-op attrs saved */ + + /* Pre-op attributes saved during fh_lock */ + __u64 fh_pre_size; /* size before operation */ + struct timespec fh_pre_mtime; /* mtime before oper */ + struct timespec fh_pre_ctime; /* ctime before oper */ + /* + * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode) + * to find out if it is valid. + */ + u64 fh_pre_change; + + /* Post-op attributes saved in fh_unlock */ + struct kstat fh_post_attr; /* full attrs after operation */ + u64 fh_post_change; /* nfsv4 change; see above */ +#endif /* CONFIG_NFSD_V3 */ + +} svc_fh; enum nfsd_fsid { FSID_DEV = 0, @@ -133,6 +182,17 @@ fh_init(struct svc_fh *fhp, int maxsize) #ifdef CONFIG_NFSD_V3 /* + * The wcc data stored in current_fh should be cleared + * between compound ops. + */ +static inline void +fh_clear_wcc(struct svc_fh *fhp) +{ + fhp->fh_post_saved = 0; + fhp->fh_pre_saved = 0; +} + +/* * Fill in the pre_op attr for the wcc data */ static inline void @@ -152,7 +212,8 @@ fill_pre_wcc(struct svc_fh *fhp) extern void fill_post_wcc(struct svc_fh *); #else -#define fill_pre_wcc(ignored) +#define fh_clear_wcc(ignored) +#define fill_pre_wcc(ignored) #define fill_post_wcc(notused) #endif /* CONFIG_NFSD_V3 */ @@ -203,4 +264,4 @@ fh_unlock(struct svc_fh *fhp) } } -#endif /* _LINUX_NFSD_FH_INT_H */ +#endif /* _LINUX_NFSD_NFSFH_H */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 760c85a6f53..1879e43f286 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -241,6 +241,15 @@ static void nfsd_shutdown_generic(void) nfsd_racache_shutdown(); } +static bool nfsd_needs_lockd(void) +{ +#if defined(CONFIG_NFSD_V3) + return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL); +#else + return (nfsd_versions[2] != NULL); +#endif +} + static int nfsd_startup_net(int nrservs, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -255,9 +264,14 @@ static int nfsd_startup_net(int nrservs, struct net *net) ret = nfsd_init_socks(net); if (ret) goto out_socks; - ret = lockd_up(net); - if (ret) - goto out_socks; + + if (nfsd_needs_lockd() && !nn->lockd_up) { + ret = lockd_up(net); + if (ret) + goto out_socks; + nn->lockd_up = 1; + } + ret = nfs4_state_start_net(net); if (ret) goto out_lockd; @@ -266,7 +280,10 @@ static int nfsd_startup_net(int nrservs, struct net *net) return 0; out_lockd: - lockd_down(net); + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = 0; + } out_socks: nfsd_shutdown_generic(); return ret; @@ -277,7 +294,10 @@ static void nfsd_shutdown_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_state_shutdown_net(net); - lockd_down(net); + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = 0; + } nn->nfsd_net_up = false; nfsd_shutdown_generic(); } @@ -571,12 +591,6 @@ nfsd(void *vrqstp) nfsdstats.th_cnt++; mutex_unlock(&nfsd_mutex); - /* - * We want less throttling in balance_dirty_pages() so that nfs to - * localhost doesn't cause nfsd to lock up due to all the client's - * dirty pages. - */ - current->flags |= PF_LESS_THROTTLE; set_freezable(); /* diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 9c769a47ac5..1ac306b769d 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -214,7 +214,8 @@ nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) int nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; return xdr_argsize_check(rqstp, p); } @@ -248,7 +249,8 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, { unsigned int len; int v; - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->offset = ntohl(*p++); @@ -281,7 +283,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, unsigned int len, hdr, dlen; int v; - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p++; /* beginoffset */ @@ -355,7 +358,8 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p, int nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); @@ -391,7 +395,8 @@ int nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readdirargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->cookie = ntohl(*p++); args->count = ntohl(*p++); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 424d8f5f231..374c66283ac 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -37,7 +37,6 @@ #include <linux/idr.h> #include <linux/sunrpc/svc_xprt.h> -#include <linux/nfsd/nfsfh.h> #include "nfsfh.h" typedef struct { @@ -123,7 +122,7 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) /* Maximum number of operations per session compound */ #define NFSD_MAX_OPS_PER_COMPOUND 16 /* Maximum session per slot cache size */ -#define NFSD_SLOT_CACHE_SIZE 1024 +#define NFSD_SLOT_CACHE_SIZE 2048 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ #define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 #define NFSD_MAX_MEM_PER_SESSION \ @@ -464,8 +463,6 @@ extern void nfs4_release_reclaim(struct nfsd_net *); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn); extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn); -extern void nfs4_free_openowner(struct nfs4_openowner *); -extern void nfs4_free_lockowner(struct nfs4_lockowner *); extern int set_callback_cred(void); extern void nfsd4_init_callback(struct nfsd4_callback *); extern void nfsd4_probe_callback(struct nfs4_client *clp); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 6d4521feb6e..cd90878a76a 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -24,7 +24,6 @@ #include <linux/seq_file.h> #include <linux/module.h> #include <linux/sunrpc/stats.h> -#include <linux/nfsd/stats.h> #include <net/net_namespace.h> #include "nfsd.h" diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h new file mode 100644 index 00000000000..a5c944b771c --- /dev/null +++ b/fs/nfsd/stats.h @@ -0,0 +1,43 @@ +/* + * Statistics for NFS server. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ +#ifndef _NFSD_STATS_H +#define _NFSD_STATS_H + +#include <uapi/linux/nfsd/stats.h> + + +struct nfsd_stats { + unsigned int rchits; /* repcache hits */ + unsigned int rcmisses; /* repcache hits */ + unsigned int rcnocache; /* uncached reqs */ + unsigned int fh_stale; /* FH stale error */ + unsigned int fh_lookup; /* dentry cached */ + unsigned int fh_anon; /* anon file dentry returned */ + unsigned int fh_nocache_dir; /* filehandle not found in dcache */ + unsigned int fh_nocache_nondir; /* filehandle not found in dcache */ + unsigned int io_read; /* bytes returned to read requests */ + unsigned int io_write; /* bytes passed in write requests */ + unsigned int th_cnt; /* number of available threads */ + unsigned int th_usage[10]; /* number of ticks during which n perdeciles + * of available threads were in use */ + unsigned int th_fullcnt; /* number of times last free thread was used */ + unsigned int ra_size; /* size of ra cache */ + unsigned int ra_depth[11]; /* number of times ra entry was found that deep + * in the cache (10percentiles). [10] = not found */ +#ifdef CONFIG_NFSD_V4 + unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ +#endif + +}; + + +extern struct nfsd_stats nfsdstats; +extern struct svc_stat nfsd_svcstats; + +void nfsd_stat_init(void); +void nfsd_stat_shutdown(void); + +#endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 13886f7f40d..140c496f612 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -207,7 +207,12 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; } } else { - fh_lock(fhp); + /* + * In the nfsd4_open() case, this may be held across + * subsequent open and delegation acquisition which may + * need to take the child's i_mutex: + */ + fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = lookup_one_len(name, dparent, len); host_err = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -273,13 +278,6 @@ out: return err; } -static int nfsd_break_lease(struct inode *inode) -{ - if (!S_ISREG(inode->i_mode)) - return 0; - return break_lease(inode, O_WRONLY | O_NONBLOCK); -} - /* * Commit metadata changes to stable storage. */ @@ -298,41 +296,12 @@ commit_metadata(struct svc_fh *fhp) } /* - * Set various file attributes. - * N.B. After this call fhp needs an fh_put + * Go over the attributes and take care of the small differences between + * NFS semantics and what Linux expects. */ -__be32 -nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, - int check_guard, time_t guardtime) +static void +nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) { - struct dentry *dentry; - struct inode *inode; - int accmode = NFSD_MAY_SATTR; - umode_t ftype = 0; - __be32 err; - int host_err; - int size_change = 0; - - if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) - accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; - if (iap->ia_valid & ATTR_SIZE) - ftype = S_IFREG; - - /* Get inode */ - err = fh_verify(rqstp, fhp, ftype, accmode); - if (err) - goto out; - - dentry = fhp->fh_dentry; - inode = dentry->d_inode; - - /* Ignore any mode updates on symlinks */ - if (S_ISLNK(inode->i_mode)) - iap->ia_valid &= ~ATTR_MODE; - - if (!iap->ia_valid) - goto out; - /* * NFSv2 does not differentiate between "set-[ac]time-to-now" * which only requires access, and "set-[ac]time-to-X" which @@ -342,8 +311,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, * convert to "set to now" instead of "set to explicit time" * * We only call inode_change_ok as the last test as technically - * it is not an interface that we should be using. It is only - * valid if the filesystem does not define it's own i_op->setattr. + * it is not an interface that we should be using. */ #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) #define MAX_TOUCH_TIME_ERROR (30*60) @@ -369,30 +337,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid &= ~BOTH_TIME_SET; } } - - /* - * The size case is special. - * It changes the file as well as the attributes. - */ - if (iap->ia_valid & ATTR_SIZE) { - if (iap->ia_size < inode->i_size) { - err = nfsd_permission(rqstp, fhp->fh_export, dentry, - NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); - if (err) - goto out; - } - - host_err = get_write_access(inode); - if (host_err) - goto out_nfserr; - - size_change = 1; - host_err = locks_verify_truncate(inode, NULL, iap->ia_size); - if (host_err) { - put_write_access(inode); - goto out_nfserr; - } - } /* sanitize the mode change */ if (iap->ia_valid & ATTR_MODE) { @@ -402,8 +346,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, /* Revoke setuid/setgid on chown */ if (!S_ISDIR(inode->i_mode) && - (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) || - ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) { + ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ @@ -415,186 +358,118 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); } } - - /* Change the attributes. */ - - iap->ia_valid |= ATTR_CTIME; - - err = nfserr_notsync; - if (!check_guard || guardtime == inode->i_ctime.tv_sec) { - host_err = nfsd_break_lease(inode); - if (host_err) - goto out_nfserr; - fh_lock(fhp); - - host_err = notify_change(dentry, iap); - err = nfserrno(host_err); - fh_unlock(fhp); - } - if (size_change) - put_write_access(inode); - if (!err) - commit_metadata(fhp); -out: - return err; - -out_nfserr: - err = nfserrno(host_err); - goto out; } -#if defined(CONFIG_NFSD_V2_ACL) || \ - defined(CONFIG_NFSD_V3_ACL) || \ - defined(CONFIG_NFSD_V4) -static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) +static __be32 +nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct iattr *iap) { - ssize_t buflen; - ssize_t ret; - - buflen = vfs_getxattr(dentry, key, NULL, 0); - if (buflen <= 0) - return buflen; + struct inode *inode = fhp->fh_dentry->d_inode; + int host_err; - *buf = kmalloc(buflen, GFP_KERNEL); - if (!*buf) - return -ENOMEM; + if (iap->ia_size < inode->i_size) { + __be32 err; - ret = vfs_getxattr(dentry, key, *buf, buflen); - if (ret < 0) - kfree(*buf); - return ret; -} -#endif + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE); + if (err) + return err; + } -#if defined(CONFIG_NFSD_V4) -static int -set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) -{ - int len; - size_t buflen; - char *buf = NULL; - int error = 0; - - buflen = posix_acl_xattr_size(pacl->a_count); - buf = kmalloc(buflen, GFP_KERNEL); - error = -ENOMEM; - if (buf == NULL) - goto out; + host_err = get_write_access(inode); + if (host_err) + goto out_nfserrno; - len = posix_acl_to_xattr(&init_user_ns, pacl, buf, buflen); - if (len < 0) { - error = len; - goto out; - } + host_err = locks_verify_truncate(inode, NULL, iap->ia_size); + if (host_err) + goto out_put_write_access; + return 0; - error = vfs_setxattr(dentry, key, buf, len, 0); -out: - kfree(buf); - return error; +out_put_write_access: + put_write_access(inode); +out_nfserrno: + return nfserrno(host_err); } +/* + * Set various file attributes. After this call fhp needs an fh_put. + */ __be32 -nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct nfs4_acl *acl) +nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, + int check_guard, time_t guardtime) { - __be32 error; - int host_error; - struct dentry *dentry; - struct inode *inode; - struct posix_acl *pacl = NULL, *dpacl = NULL; - unsigned int flags = 0; + struct dentry *dentry; + struct inode *inode; + int accmode = NFSD_MAY_SATTR; + umode_t ftype = 0; + __be32 err; + int host_err; + bool get_write_count; + int size_change = 0; + + if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) + accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; + if (iap->ia_valid & ATTR_SIZE) + ftype = S_IFREG; + + /* Callers that do fh_verify should do the fh_want_write: */ + get_write_count = !fhp->fh_dentry; /* Get inode */ - error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); - if (error) - return error; + err = fh_verify(rqstp, fhp, ftype, accmode); + if (err) + goto out; + if (get_write_count) { + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + } dentry = fhp->fh_dentry; inode = dentry->d_inode; - if (S_ISDIR(inode->i_mode)) - flags = NFS4_ACL_DIR; - host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); - if (host_error == -EINVAL) { - return nfserr_attrnotsupp; - } else if (host_error < 0) - goto out_nfserr; + /* Ignore any mode updates on symlinks */ + if (S_ISLNK(inode->i_mode)) + iap->ia_valid &= ~ATTR_MODE; - host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); - if (host_error < 0) - goto out_release; + if (!iap->ia_valid) + goto out; - if (S_ISDIR(inode->i_mode)) - host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); + nfsd_sanitize_attrs(inode, iap); -out_release: - posix_acl_release(pacl); - posix_acl_release(dpacl); -out_nfserr: - if (host_error == -EOPNOTSUPP) - return nfserr_attrnotsupp; - else - return nfserrno(host_error); -} + /* + * The size case is special, it changes the file in addition to the + * attributes. + */ + if (iap->ia_valid & ATTR_SIZE) { + err = nfsd_get_write_access(rqstp, fhp, iap); + if (err) + goto out; + size_change = 1; + } -static struct posix_acl * -_get_posix_acl(struct dentry *dentry, char *key) -{ - void *buf = NULL; - struct posix_acl *pacl = NULL; - int buflen; - - buflen = nfsd_getxattr(dentry, key, &buf); - if (!buflen) - buflen = -ENODATA; - if (buflen <= 0) - return ERR_PTR(buflen); - - pacl = posix_acl_from_xattr(&init_user_ns, buf, buflen); - kfree(buf); - return pacl; -} + iap->ia_valid |= ATTR_CTIME; -int -nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) -{ - struct inode *inode = dentry->d_inode; - int error = 0; - struct posix_acl *pacl = NULL, *dpacl = NULL; - unsigned int flags = 0; - - pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); - if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) - pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); - if (IS_ERR(pacl)) { - error = PTR_ERR(pacl); - pacl = NULL; - goto out; + if (check_guard && guardtime != inode->i_ctime.tv_sec) { + err = nfserr_notsync; + goto out_put_write_access; } - if (S_ISDIR(inode->i_mode)) { - dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); - if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) - dpacl = NULL; - else if (IS_ERR(dpacl)) { - error = PTR_ERR(dpacl); - dpacl = NULL; - goto out; - } - flags = NFS4_ACL_DIR; - } + fh_lock(fhp); + host_err = notify_change(dentry, iap, NULL); + fh_unlock(fhp); + err = nfserrno(host_err); - *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags); - if (IS_ERR(*acl)) { - error = PTR_ERR(*acl); - *acl = NULL; - } - out: - posix_acl_release(pacl); - posix_acl_release(dpacl); - return error; +out_put_write_access: + if (size_change) + put_write_access(inode); + if (!err) + commit_metadata(fhp); +out: + return err; } +#if defined(CONFIG_NFSD_V4) /* * NFS junction information is stored in an extended attribute. */ @@ -945,51 +820,54 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } -static __be32 -nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +__be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err) { - mm_segment_t oldfs; - __be32 err; - int host_err; - - err = nfserr_perm; - - if (file->f_op->splice_read && rqstp->rq_splice_ok) { - struct splice_desc sd = { - .len = 0, - .total_len = *count, - .pos = offset, - .u.data = rqstp, - }; - - rqstp->rq_next_page = rqstp->rq_respages + 1; - host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); - } else { - oldfs = get_fs(); - set_fs(KERNEL_DS); - host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); - set_fs(oldfs); - } - if (host_err >= 0) { nfsdstats.io_read += host_err; *count = host_err; - err = 0; fsnotify_access(file); + return 0; } else - err = nfserrno(host_err); - return err; + return nfserrno(host_err); } -static void kill_suid(struct dentry *dentry) +int nfsd_splice_read(struct svc_rqst *rqstp, + struct file *file, loff_t offset, unsigned long *count) { - struct iattr ia; - ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + struct splice_desc sd = { + .len = 0, + .total_len = *count, + .pos = offset, + .u.data = rqstp, + }; + int host_err; - mutex_lock(&dentry->d_inode->i_mutex); - notify_change(dentry, &ia); - mutex_unlock(&dentry->d_inode->i_mutex); + rqstp->rq_next_page = rqstp->rq_respages + 1; + host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); + return nfsd_finish_read(file, count, host_err); +} + +int nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, + unsigned long *count) +{ + mm_segment_t oldfs; + int host_err; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); + set_fs(oldfs); + return nfsd_finish_read(file, count, host_err); +} + +static __be32 +nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +{ + if (file->f_op->splice_read && rqstp->rq_splice_ok) + return nfsd_splice_read(rqstp, file, offset, count); + else + return nfsd_readv(file, offset, vec, vlen, count); } /* @@ -1043,6 +921,16 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int stable = *stablep; int use_wgather; loff_t pos = offset; + unsigned int pflags = current->flags; + + if (rqstp->rq_local) + /* + * We want less throttling in balance_dirty_pages() + * and shrink_inactive_list() so that nfs to + * localhost doesn't cause nfsd to lock up due to all + * the client's dirty pages or its congested queue. + */ + current->flags |= PF_LESS_THROTTLE; dentry = file->f_path.dentry; inode = dentry->d_inode; @@ -1063,10 +951,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, nfsdstats.io_write += host_err; fsnotify_modify(file); - /* clear setuid/setgid flag after write */ - if (inode->i_mode & (S_ISUID | S_ISGID)) - kill_suid(dentry); - if (stable) { if (use_wgather) host_err = wait_for_concurrent_writes(file); @@ -1080,36 +964,33 @@ out_nfserr: err = 0; else err = nfserrno(host_err); + if (rqstp->rq_local) + tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); return err; } -/* - * Read data from a file. count must contain the requested read count - * on entry. On return, *count contains the number of bytes actually read. - * N.B. After this call fhp needs an fh_put - */ -__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +__be32 nfsd_get_tmp_read_open(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file **file, struct raparms **ra) { - struct file *file; struct inode *inode; - struct raparms *ra; __be32 err; - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, file); if (err) return err; - inode = file_inode(file); + inode = file_inode(*file); /* Get readahead parameters */ - ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); - - if (ra && ra->p_set) - file->f_ra = ra->p_ra; + *ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); - err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); + if (*ra && (*ra)->p_set) + (*file)->f_ra = (*ra)->p_ra; + return nfs_ok; +} +void nfsd_put_tmp_read_open(struct file *file, struct raparms *ra) +{ /* Write back readahead params */ if (ra) { struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; @@ -1119,28 +1000,29 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ra->p_count--; spin_unlock(&rab->pb_lock); } - nfsd_close(file); - return err; } -/* As above, but use the provided file descriptor. */ -__be32 -nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, - unsigned long *count) +/* + * Read data from a file. count must contain the requested read count + * on entry. On return, *count contains the number of bytes actually read. + * N.B. After this call fhp needs an fh_put + */ +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { - __be32 err; + struct file *file; + struct raparms *ra; + __be32 err; + + err = nfsd_get_tmp_read_open(rqstp, fhp, &file, &ra); + if (err) + return err; + + err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); + + nfsd_put_tmp_read_open(file, ra); - if (file) { - err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); - if (err) - goto out; - err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); - } else /* Note file may still be NULL in NFSv4 special stateid case: */ - err = nfsd_read(rqstp, fhp, offset, vec, vlen, count); -out: return err; } @@ -1731,12 +1613,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserr_noent; if (!dold->d_inode) goto out_dput; - host_err = nfsd_break_lease(dold->d_inode); - if (host_err) { - err = nfserrno(host_err); - goto out_dput; - } - host_err = vfs_link(dold, dirp, dnew); + host_err = vfs_link(dold, dirp, dnew, NULL); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); if (!err) @@ -1829,15 +1706,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - host_err = nfsd_break_lease(odentry->d_inode); - if (host_err) - goto out_dput_new; - if (ndentry->d_inode) { - host_err = nfsd_break_lease(ndentry->d_inode); - if (host_err) - goto out_dput_new; - } - host_err = vfs_rename(fdir, odentry, tdir, ndentry); + host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) @@ -1849,10 +1718,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, dput(odentry); out_nfserr: err = nfserrno(host_err); - - /* we cannot reply on fh_unlock on the two filehandles, + /* + * We cannot rely on fh_unlock on the two filehandles, * as that would do the wrong thing if the two directories - * were the same, so again we do it by hand + * were the same, so again we do it by hand. */ fill_post_wcc(ffhp); fill_post_wcc(tfhp); @@ -1906,16 +1775,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = rdentry->d_inode->i_mode & S_IFMT; - host_err = nfsd_break_lease(rdentry->d_inode); - if (host_err) - goto out_put; if (type != S_IFDIR) - host_err = vfs_unlink(dirp, rdentry); + host_err = vfs_unlink(dirp, rdentry, NULL); else host_err = vfs_rmdir(dirp, rdentry); if (!host_err) host_err = commit_metadata(fhp); -out_put: dput(rdentry); out_nfserr: @@ -2255,93 +2120,3 @@ out_nomem: nfsd_racache_shutdown(); return -ENOMEM; } - -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -struct posix_acl * -nfsd_get_posix_acl(struct svc_fh *fhp, int type) -{ - struct inode *inode = fhp->fh_dentry->d_inode; - char *name; - void *value = NULL; - ssize_t size; - struct posix_acl *acl; - - if (!IS_POSIXACL(inode)) - return ERR_PTR(-EOPNOTSUPP); - - switch (type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return ERR_PTR(-EOPNOTSUPP); - } - - size = nfsd_getxattr(fhp->fh_dentry, name, &value); - if (size < 0) - return ERR_PTR(size); - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - kfree(value); - return acl; -} - -int -nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) -{ - struct inode *inode = fhp->fh_dentry->d_inode; - char *name; - void *value = NULL; - size_t size; - int error; - - if (!IS_POSIXACL(inode) || - !inode->i_op->setxattr || !inode->i_op->removexattr) - return -EOPNOTSUPP; - switch(type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return -EOPNOTSUPP; - } - - if (acl && acl->a_count) { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_KERNEL); - if (!value) - return -ENOMEM; - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (error < 0) - goto getout; - size = error; - } else - size = 0; - - error = fh_want_write(fhp); - if (error) - goto getout; - if (size) - error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); - else { - if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) - error = 0; - else { - error = vfs_removexattr(fhp->fh_dentry, name); - if (error == -ENODATA) - error = 0; - } - } - fh_drop_write(fhp); - -getout: - kfree(value); - return error; -} -#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a4be2e38967..91b6ae3f658 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -52,9 +52,6 @@ __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, struct iattr *, int, time_t); int nfsd_mountpoint(struct dentry *, struct svc_export *); #ifdef CONFIG_NFSD_V4 -__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, - struct nfs4_acl *); -int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); #endif /* CONFIG_NFSD_V4 */ @@ -73,10 +70,16 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); void nfsd_close(struct file *); +struct raparms; +__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *, + struct file **, struct raparms **); +void nfsd_put_tmp_read_open(struct file *, struct raparms *); +int nfsd_splice_read(struct svc_rqst *, + struct file *, loff_t, unsigned long *); +int nfsd_readv(struct file *, loff_t, struct kvec *, int, + unsigned long *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); -__be32 nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *, - loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, loff_t, struct kvec *,int, unsigned long *, int *); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, @@ -89,8 +92,6 @@ __be32 nfsd_link(struct svc_rqst *, struct svc_fh *, __be32 nfsd_rename(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *, char *, int); -__be32 nfsd_remove(struct svc_rqst *, - struct svc_fh *, char *, int); __be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, char *name, int len); __be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, @@ -101,11 +102,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); -int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); -#endif - static inline int fh_want_write(struct svc_fh *fh) { int ret = mnt_want_write(fh->fh_export->ex_path.mnt); diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index b6d5542a4ac..335e04aaf7d 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -174,6 +174,9 @@ struct nfsd3_linkres { struct nfsd3_readdirres { __be32 status; struct svc_fh fh; + /* Just to save kmalloc on every readdirplus entry (svc_fh is a + * little large for the stack): */ + struct svc_fh scratch; int count; __be32 verf[2]; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index b3ed6446ed8..18cbb6d9c8a 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -58,7 +58,7 @@ struct nfsd4_compound_state { /* For sessions DRC */ struct nfsd4_session *session; struct nfsd4_slot *slot; - __be32 *datap; + int data_offset; size_t iovlen; u32 minorversion; __be32 status; @@ -228,7 +228,7 @@ struct nfsd4_open { u32 op_create; /* request */ u32 op_createmode; /* request */ u32 op_bmval[3]; /* request */ - struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ nfs4_verifier op_verf __attribute__((aligned(32))); /* EXCLUSIVE4 */ clientid_t op_clientid; /* request */ @@ -250,7 +250,6 @@ struct nfsd4_open { struct nfs4_acl *op_acl; struct xdr_netobj op_label; }; -#define op_iattr iattr struct nfsd4_open_confirm { stateid_t oc_req_stateid /* request */; @@ -288,9 +287,8 @@ struct nfsd4_readdir { struct svc_fh * rd_fhp; /* response */ struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; + struct xdr_stream *xdr; + int cookie_offset; }; struct nfsd4_release_lockowner { @@ -374,7 +372,6 @@ struct nfsd4_test_stateid { struct nfsd4_free_stateid { stateid_t fr_stateid; /* request */ - __be32 fr_status; /* response */ }; /* also used for NVERIFY */ @@ -508,9 +505,7 @@ struct nfsd4_compoundargs { struct nfsd4_compoundres { /* scratch variables for XDR encode */ - __be32 * p; - __be32 * end; - struct xdr_buf * xbuf; + struct xdr_stream xdr; struct svc_rqst * rqstp; u32 taglen; @@ -540,6 +535,9 @@ static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp) return argp->opcnt == resp->opcnt; } +int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op); +void warn_on_nonidempotent_op(struct nfsd4_op *op); + #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) static inline void @@ -565,10 +563,11 @@ int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, struct nfsd4_compoundres *); __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); -void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); -__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - struct dentry *dentry, __be32 **buffer, int countp, - u32 *bmval, struct svc_rqst *, int ignore_crossmnt); +void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op); +__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid *setclid); @@ -576,8 +575,6 @@ extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid_confirm *setclientid_confirm); extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); -extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, - struct nfsd4_sequence *seq); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *); diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index deaa3d33a0a..0d58075f34e 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -942,6 +942,18 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, struct inode *cpfile; int err; + if (cpsize > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large checkpoint size: %zu bytes.\n", + cpsize); + return -EINVAL; + } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) { + printk(KERN_ERR + "NILFS: too small checkpoint size: %zu bytes.\n", + cpsize); + return -EINVAL; + } + cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); if (unlikely(!cpfile)) return -ENOMEM; diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index fa0f80308c2..0d5fada9119 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -484,6 +484,18 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size, struct nilfs_dat_info *di; int err; + if (entry_size > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large DAT entry size: %zu bytes.\n", + entry_size); + return -EINVAL; + } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) { + printk(KERN_ERR + "NILFS: too small DAT entry size: %zu bytes.\n", + entry_size); + return -EINVAL; + } + dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); if (unlikely(!dat)) return -ENOMEM; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 08fdb77852a..24978153c0c 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -134,6 +134,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -151,10 +152,10 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) */ const struct file_operations nilfs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = nilfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = nilfs_compat_ioctl, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 7e350c562e0..6252b173a46 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -298,19 +298,20 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, } static ssize_t -nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); ssize_t size; if (rw == WRITE) return 0; /* Needs synchronization with the cleaner */ - size = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + size = blockdev_direct_IO(rw, iocb, inode, iter, offset, nilfs_get_block); /* @@ -319,7 +320,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, */ if (unlikely((rw & WRITE) && size < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) nilfs_write_failed(mapping, end); @@ -783,16 +784,14 @@ void nilfs_evict_inode(struct inode *inode) int ret; if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { - if (inode->i_data.nrpages) - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); nilfs_clear_inode(inode); return; } nilfs_transaction_begin(sb, &ti, 0); /* never fails */ - if (inode->i_data.nrpages) - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); /* TODO: some of the following operations may fail. */ nilfs_truncate_bmap(ii, 0); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index b44bdb291b8..422fb54b737 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -37,7 +37,26 @@ #include "sufile.h" #include "dat.h" - +/** + * nilfs_ioctl_wrap_copy - wrapping function of get/set metadata info + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @dir: set of direction flags + * @dofunc: concrete function of get/set metadata info + * + * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by means of + * calling dofunc() function on the basis of @argv argument. + * + * Return Value: On success, 0 is returned and requested metadata info + * is copied into userspace. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, struct nilfs_argv *argv, int dir, ssize_t (*dofunc)(struct the_nilfs *, @@ -57,6 +76,14 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, if (argv->v_size > PAGE_SIZE) return -EINVAL; + /* + * Reject pairs of a start item position (argv->v_index) and a + * total count (argv->v_nmembs) which leads position 'pos' to + * overflow by the increment at the end of the loop. + */ + if (argv->v_index > ~(__u64)0 - argv->v_nmembs) + return -EINVAL; + buf = (void *)__get_free_pages(GFP_NOFS, 0); if (unlikely(!buf)) return -ENOMEM; @@ -99,6 +126,9 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, return ret; } +/** + * nilfs_ioctl_getflags - ioctl to support lsattr + */ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) { unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE; @@ -106,6 +136,9 @@ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) return put_user(flags, (int __user *)argp); } +/** + * nilfs_ioctl_setflags - ioctl to support chattr + */ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, void __user *argp) { @@ -158,11 +191,33 @@ out: return ret; } +/** + * nilfs_ioctl_getversion - get info about a file's version (generation number) + */ static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) { return put_user(inode->i_generation, (int __user *)argp); } +/** + * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot) + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_change_cpmode() function changes mode of + * given checkpoint between checkpoint and snapshot state. This ioctl + * is used in chcp and mkcp utilities. + * + * Return Value: On success, 0 is returned and mode of a checkpoint is + * changed. On error, one of the following negative error codes + * is returned. + * + * %-EPERM - Operation not permitted. + * + * %-EFAULT - Failure during checkpoint mode changing. + */ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -198,6 +253,25 @@ out: return ret; } +/** + * nilfs_ioctl_delete_checkpoint - remove checkpoint + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_delete_checkpoint() function removes + * checkpoint from NILFS2 file system. This ioctl is used in rmcp + * utility. + * + * Return Value: On success, 0 is returned and a checkpoint is + * removed. On error, one of the following negative error codes + * is returned. + * + * %-EPERM - Operation not permitted. + * + * %-EFAULT - Failure during checkpoint removing. + */ static int nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -229,6 +303,21 @@ out: return ret; } +/** + * nilfs_ioctl_do_get_cpinfo - callback method getting info about checkpoints + * @nilfs: nilfs object + * @posp: pointer on array of checkpoint's numbers + * @flags: checkpoint mode (checkpoint or snapshot) + * @buf: buffer for storing checkponts' info + * @size: size in bytes of one checkpoint info item in array + * @nmembs: number of checkpoints in array (numbers and infos) + * + * Description: nilfs_ioctl_do_get_cpinfo() function returns info about + * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in + * lscp utility and by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_cpinfo structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -242,6 +331,27 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_get_cpstat - get checkpoints statistics + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints. + * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities + * and by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and checkpoints information is + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting checkpoints statistics. + */ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -260,6 +370,21 @@ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_do_get_suinfo - callback method getting segment usage info + * @nilfs: nilfs object + * @posp: pointer on array of segment numbers + * @flags: *not used* + * @buf: buffer for storing suinfo array + * @size: size in bytes of one suinfo item in array + * @nmembs: count of segment numbers and suinfos in array + * + * Description: nilfs_ioctl_do_get_suinfo() function returns segment usage + * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used + * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_suinfo structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -273,6 +398,27 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_get_sustat - get segment usage statistics + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_get_sustat() returns segment usage statistics. + * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities + * and by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and segment usage information is + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting segment usage statistics. + */ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -291,6 +437,21 @@ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_do_get_vinfo - callback method getting virtual blocks info + * @nilfs: nilfs object + * @posp: *not used* + * @flags: *not used* + * @buf: buffer for storing array of nilfs_vinfo structures + * @size: size in bytes of one vinfo item in array + * @nmembs: count of vinfos in array + * + * Description: nilfs_ioctl_do_get_vinfo() function returns information + * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used + * by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_vinfo structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -303,6 +464,21 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_do_get_bdescs - callback method getting disk block descriptors + * @nilfs: nilfs object + * @posp: *not used* + * @flags: *not used* + * @buf: buffer for storing array of nilfs_bdesc structures + * @size: size in bytes of one bdesc item in array + * @nmembs: count of bdescs in array + * + * Description: nilfs_ioctl_do_get_bdescs() function returns information + * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl + * is used by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_bdescs structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -329,6 +505,29 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, return nmembs; } +/** + * nilfs_ioctl_get_bdescs - get disk block descriptors + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_do_get_bdescs() function returns information + * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl + * is used by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and disk block descriptors are + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting disk block descriptors. + */ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -352,6 +551,26 @@ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_move_inode_block - prepare data/node block for moving by GC + * @inode: inode object + * @vdesc: descriptor of virtual block number + * @buffers: list of moving buffers + * + * Description: nilfs_ioctl_move_inode_block() function registers data/node + * buffer in the GC pagecache and submit read request. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - Requested block doesn't exist. + * + * %-EEXIST - Blocks conflict is detected. + */ static int nilfs_ioctl_move_inode_block(struct inode *inode, struct nilfs_vdesc *vdesc, struct list_head *buffers) @@ -397,6 +616,19 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, return 0; } +/** + * nilfs_ioctl_move_blocks - move valid inode's blocks during garbage collection + * @sb: superblock object + * @argv: vector of arguments from userspace + * @buf: array of nilfs_vdesc structures + * + * Description: nilfs_ioctl_move_blocks() function reads valid data/node + * blocks that garbage collector specified with the array of nilfs_vdesc + * structures and stores them into page caches of GC inodes. + * + * Return Value: Number of processed nilfs_vdesc structures or + * error code, otherwise. + */ static int nilfs_ioctl_move_blocks(struct super_block *sb, struct nilfs_argv *argv, void *buf) { @@ -462,6 +694,25 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, return ret; } +/** + * nilfs_ioctl_delete_checkpoints - delete checkpoints + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of periods of checkpoints numbers + * + * Description: nilfs_ioctl_delete_checkpoints() function deletes checkpoints + * in the period from p_start to p_end, excluding p_end itself. The checkpoints + * which have been already deleted are ignored. + * + * Return Value: Number of processed nilfs_period structures or + * error code, otherwise. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. + */ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -479,6 +730,24 @@ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, return nmembs; } +/** + * nilfs_ioctl_free_vblocknrs - free virtual block numbers + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of virtual block numbers + * + * Description: nilfs_ioctl_free_vblocknrs() function frees + * the virtual block numbers specified by @buf and @argv->v_nmembs. + * + * Return Value: Number of processed virtual block numbers or + * error code, otherwise. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block number have not been allocated. + */ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -490,6 +759,24 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, return (ret < 0) ? ret : nmembs; } +/** + * nilfs_ioctl_mark_blocks_dirty - mark blocks dirty + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of block descriptors + * + * Description: nilfs_ioctl_mark_blocks_dirty() function marks + * metadata file or data blocks as dirty. + * + * Return Value: Number of processed block descriptors or + * error code, otherwise. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + */ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -571,6 +858,20 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, return ret; } +/** + * nilfs_ioctl_clean_segments - clean segments + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_clean_segments() function makes garbage + * collection operation in the environment of requested parameters + * from userspace. The NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by + * nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -682,6 +983,33 @@ out: return ret; } +/** + * nilfs_ioctl_sync - make a checkpoint + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_sync() function constructs a logical segment + * for checkpointing. This function guarantees that all modified data + * and metadata are written out to the device when it successfully + * returned. + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -710,6 +1038,14 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, return 0; } +/** + * nilfs_ioctl_resize - resize NILFS2 volume + * @inode: inode object + * @filp: file object + * @argp: pointer on argument from userspace + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, void __user *argp) { @@ -735,6 +1071,59 @@ out: return ret; } +/** + * nilfs_ioctl_trim_fs() - trim ioctl handle function + * @inode: inode object + * @argp: pointer on argument from userspace + * + * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It + * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which + * performs the actual trim operation. + * + * Return Value: On success, 0 is returned or negative error code, otherwise. + */ +static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct request_queue *q = bdev_get_queue(nilfs->ns_bdev); + struct fstrim_range range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, argp, sizeof(range))) + return -EFAULT; + + range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity); + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range); + up_read(&nilfs->ns_segctor_sem); + + if (ret < 0) + return ret; + + if (copy_to_user(argp, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + +/** + * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated + * @inode: inode object + * @argp: pointer on argument from userspace + * + * Decription: nilfs_ioctl_set_alloc_range() function defines lower limit + * of segments in bytes and upper limit of segments in bytes. + * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility. + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; @@ -767,6 +1156,28 @@ out: return ret; } +/** + * nilfs_ioctl_get_info - wrapping function of get metadata info + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * @membsz: size of an item in bytes + * @dofunc: concrete function of getting metadata info + * + * Description: nilfs_ioctl_get_info() gets metadata info by means of + * calling dofunc() function. + * + * Return Value: On success, 0 is returned and requested metadata info + * is copied into userspace. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp, size_t membsz, @@ -794,6 +1205,95 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_set_suinfo - set segment usage info + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: Expects an array of nilfs_suinfo_update structures + * encapsulated in nilfs_argv and updates the segment usage info + * according to the flags in nilfs_suinfo_update. + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EPERM - Not enough permissions + * + * %-EFAULT - Error copying input data + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + */ +static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_transaction_info ti; + struct nilfs_argv argv; + size_t len; + void __user *base; + void *kbuf; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(&argv, argp, sizeof(argv))) + goto out; + + ret = -EINVAL; + if (argv.v_size < sizeof(struct nilfs_suinfo_update)) + goto out; + + if (argv.v_nmembs > nilfs->ns_nsegments) + goto out; + + if (argv.v_nmembs >= UINT_MAX / argv.v_size) + goto out; + + len = argv.v_size * argv.v_nmembs; + if (!len) { + ret = 0; + goto out; + } + + base = (void __user *)(unsigned long)argv.v_base; + kbuf = vmalloc(len); + if (!kbuf) { + ret = -ENOMEM; + goto out; + } + + if (copy_from_user(kbuf, base, len)) { + ret = -EFAULT; + goto out_free; + } + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_sufile_set_suinfo(nilfs->ns_sufile, kbuf, argv.v_size, + argv.v_nmembs); + if (unlikely(ret < 0)) + nilfs_transaction_abort(inode->i_sb); + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ + +out_free: + vfree(kbuf); +out: + mnt_drop_write_file(filp); + return ret; +} + long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -820,6 +1320,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return nilfs_ioctl_get_info(inode, filp, cmd, argp, sizeof(struct nilfs_suinfo), nilfs_ioctl_do_get_suinfo); + case NILFS_IOCTL_SET_SUINFO: + return nilfs_ioctl_set_suinfo(inode, filp, cmd, argp); case NILFS_IOCTL_GET_SUSTAT: return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); case NILFS_IOCTL_GET_VINFO: @@ -836,6 +1338,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return nilfs_ioctl_resize(inode, filp, argp); case NILFS_IOCTL_SET_ALLOC_RANGE: return nilfs_ioctl_set_alloc_range(inode, argp); + case FITRIM: + return nilfs_ioctl_trim_fs(inode, argp); default: return -ENOTTY; } @@ -859,6 +1363,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_GET_CPINFO: case NILFS_IOCTL_GET_CPSTAT: case NILFS_IOCTL_GET_SUINFO: + case NILFS_IOCTL_SET_SUINFO: case NILFS_IOCTL_GET_SUSTAT: case NILFS_IOCTL_GET_VINFO: case NILFS_IOCTL_GET_BDESCS: @@ -866,6 +1371,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_SYNC: case NILFS_IOCTL_RESIZE: case NILFS_IOCTL_SET_ALLOC_RANGE: + case FITRIM: break; default: return -ENOIOCTLCMD; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 2d8be51f90d..dc3a9efdaab 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -416,7 +416,8 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, } if (likely(bio)) { bio->bi_bdev = nilfs->ns_bdev; - bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9); + bio->bi_iter.bi_sector = + start << (nilfs->ns_blocksize_bits - 9); } return bio; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9f6b486b6c0..a1a191634ab 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, nilfs_clear_logs(&sci->sc_segbufs); - err = nilfs_segctor_extend_segments(sci, nilfs, nadd); - if (unlikely(err)) - return err; - if (sci->sc_stage.flags & NILFS_CF_SUFREED) { err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, NULL); WARN_ON(err); /* do not happen */ + sci->sc_stage.flags &= ~NILFS_CF_SUFREED; } + + err = nilfs_segctor_extend_segments(sci, nilfs, nadd); + if (unlikely(err)) + return err; + nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); sci->sc_stage = prev_stage; } diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 3127e9f438a..2a869c35c36 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -870,6 +870,289 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, } /** + * nilfs_sufile_set_suinfo - sets segment usage info + * @sufile: inode of segment usage file + * @buf: array of suinfo_update + * @supsz: byte size of suinfo_update + * @nsup: size of suinfo_update array + * + * Description: Takes an array of nilfs_suinfo_update structs and updates + * segment usage accordingly. Only the fields indicated by the sup_flags + * are updated. + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + */ +ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, + unsigned supsz, size_t nsup) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh, *bh; + struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup; + struct nilfs_segment_usage *su; + void *kaddr; + unsigned long blkoff, prev_blkoff; + int cleansi, cleansu, dirtysi, dirtysu; + long ncleaned = 0, ndirtied = 0; + int ret = 0; + + if (unlikely(nsup == 0)) + return ret; + + for (sup = buf; sup < supend; sup = (void *)sup + supsz) { + if (sup->sup_segnum >= nilfs->ns_nsegments + || (sup->sup_flags & + (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS)) + || (nilfs_suinfo_update_nblocks(sup) && + sup->sup_sui.sui_nblocks > + nilfs->ns_blocks_per_segment)) + return -EINVAL; + } + + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + sup = buf; + blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); + ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); + if (ret < 0) + goto out_header; + + for (;;) { + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage( + sufile, sup->sup_segnum, bh, kaddr); + + if (nilfs_suinfo_update_lastmod(sup)) + su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod); + + if (nilfs_suinfo_update_nblocks(sup)) + su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks); + + if (nilfs_suinfo_update_flags(sup)) { + /* + * Active flag is a virtual flag projected by running + * nilfs kernel code - drop it not to write it to + * disk. + */ + sup->sup_sui.sui_flags &= + ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + + cleansi = nilfs_suinfo_clean(&sup->sup_sui); + cleansu = nilfs_segment_usage_clean(su); + dirtysi = nilfs_suinfo_dirty(&sup->sup_sui); + dirtysu = nilfs_segment_usage_dirty(su); + + if (cleansi && !cleansu) + ++ncleaned; + else if (!cleansi && cleansu) + --ncleaned; + + if (dirtysi && !dirtysu) + ++ndirtied; + else if (!dirtysi && dirtysu) + --ndirtied; + + su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags); + } + + kunmap_atomic(kaddr); + + sup = (void *)sup + supsz; + if (sup >= supend) + break; + + prev_blkoff = blkoff; + blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); + if (blkoff == prev_blkoff) + continue; + + /* get different block */ + mark_buffer_dirty(bh); + put_bh(bh); + ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); + if (unlikely(ret < 0)) + goto out_mark; + } + mark_buffer_dirty(bh); + put_bh(bh); + + out_mark: + if (ncleaned || ndirtied) { + nilfs_sufile_mod_counter(header_bh, (u64)ncleaned, + (u64)ndirtied); + NILFS_SUI(sufile)->ncleansegs += ncleaned; + } + nilfs_mdt_mark_dirty(sufile); + out_header: + put_bh(header_bh); + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_trim_fs() - trim ioctl handle function + * @sufile: inode of segment usage file + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * + * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes + * from start to start+len. start is rounded up to the next block boundary + * and start+len is rounded down. For each clean segment blkdev_issue_discard + * function is invoked. + * + * Return Value: On success, 0 is returned or negative error code, otherwise. + */ +int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + void *kaddr; + size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size; + sector_t seg_start, seg_end, start_block, end_block; + sector_t start = 0, nblocks = 0; + u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0; + int ret = 0; + unsigned int sects_per_block; + + sects_per_block = (1 << nilfs->ns_blocksize_bits) / + bdev_logical_block_size(nilfs->ns_bdev); + len = range->len >> nilfs->ns_blocksize_bits; + minlen = range->minlen >> nilfs->ns_blocksize_bits; + max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment); + + if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits) + return -EINVAL; + + start_block = (range->start + nilfs->ns_blocksize - 1) >> + nilfs->ns_blocksize_bits; + + /* + * range->len can be very large (actually, it is set to + * ULLONG_MAX by default) - truncate upper end of the range + * carefully so as not to overflow. + */ + if (max_blocks - start_block < len) + end_block = max_blocks - 1; + else + end_block = start_block + len - 1; + + segnum = nilfs_get_segnum_of_block(nilfs, start_block); + segnum_end = nilfs_get_segnum_of_block(nilfs, end_block); + + down_read(&NILFS_MDT(sufile)->mi_sem); + + while (segnum <= segnum_end) { + n = nilfs_sufile_segment_usages_in_block(sufile, segnum, + segnum_end); + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_sem; + /* hole */ + segnum += n; + continue; + } + + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, + su_bh, kaddr); + for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { + if (!nilfs_segment_usage_clean(su)) + continue; + + nilfs_get_segment_range(nilfs, segnum, &seg_start, + &seg_end); + + if (!nblocks) { + /* start new extent */ + start = seg_start; + nblocks = seg_end - seg_start + 1; + continue; + } + + if (start + nblocks == seg_start) { + /* add to previous extent */ + nblocks += seg_end - seg_start + 1; + continue; + } + + /* discard previous extent */ + if (start < start_block) { + nblocks -= start_block - start; + start = start_block; + } + + if (nblocks >= minlen) { + kunmap_atomic(kaddr); + + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + if (ret < 0) { + put_bh(su_bh); + goto out_sem; + } + + ndiscarded += nblocks; + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + } + + /* start new extent */ + start = seg_start; + nblocks = seg_end - seg_start + 1; + } + kunmap_atomic(kaddr); + put_bh(su_bh); + } + + + if (nblocks) { + /* discard last extent */ + if (start < start_block) { + nblocks -= start_block - start; + start = start_block; + } + if (start + nblocks > end_block + 1) + nblocks = end_block - start + 1; + + if (nblocks >= minlen) { + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + if (!ret) + ndiscarded += nblocks; + } + } + +out_sem: + up_read(&NILFS_MDT(sufile)->mi_sem); + + range->len = ndiscarded << nilfs->ns_blocksize_bits; + return ret; +} + +/** * nilfs_sufile_read - read or get sufile inode * @sb: super block instance * @susize: size of a segment usage entry @@ -886,6 +1169,18 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, void *kaddr; int err; + if (susize > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large segment usage size: %zu bytes.\n", + susize); + return -EINVAL; + } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) { + printk(KERN_ERR + "NILFS: too small segment usage size: %zu bytes.\n", + susize); + return -EINVAL; + } + sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); if (unlikely(!sufile)) return -ENOMEM; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index e84bc5b51fc..b8afd72f237 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -44,6 +44,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, size_t); +ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t); int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, void (*dofunc)(struct inode *, __u64, @@ -65,6 +66,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); int nilfs_sufile_read(struct super_block *sb, size_t susize, struct nilfs_inode *raw_inode, struct inode **inodep); +int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range); /** * nilfs_sufile_scrap - make a segment garbage diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 7ac2a122ca1..8c532b2ca3a 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1129,6 +1129,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_mount_opt; int err; + sync_filesystem(sb); old_sb_flags = sb->s_flags; old_mount_opt = nilfs->ns_mount_opt; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 94c451ce6d2..8ba8229ba07 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -399,6 +399,16 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, return -EINVAL; nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); + if (nilfs->ns_inode_size > nilfs->ns_blocksize) { + printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n", + nilfs->ns_inode_size); + return -EINVAL; + } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) { + printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n", + nilfs->ns_inode_size); + return -EINVAL; + } + nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c index 634a8b717b0..266c2d7d50b 100644 --- a/fs/nls/mac-celtic.c +++ b/fs/nls/mac-celtic.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macceltic(void) diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c index 979e6265ac5..9789c605755 100644 --- a/fs/nls/mac-centeuro.c +++ b/fs/nls/mac-centeuro.c @@ -513,7 +513,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccenteuro(void) diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c index dd3f675911e..bb19e7a07d4 100644 --- a/fs/nls/mac-croatian.c +++ b/fs/nls/mac-croatian.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccroatian(void) diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c index 1112c84dd8b..2a7dea36acb 100644 --- a/fs/nls/mac-cyrillic.c +++ b/fs/nls/mac-cyrillic.c @@ -478,7 +478,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccyrillic(void) diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c index 2de9158409c..77b00165358 100644 --- a/fs/nls/mac-gaelic.c +++ b/fs/nls/mac-gaelic.c @@ -548,7 +548,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macgaelic(void) diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c index a8631008280..1eccf499e2e 100644 --- a/fs/nls/mac-greek.c +++ b/fs/nls/mac-greek.c @@ -478,7 +478,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macgreek(void) diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c index babe2998d5c..cbd0875c6d6 100644 --- a/fs/nls/mac-iceland.c +++ b/fs/nls/mac-iceland.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maciceland(void) diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c index 312364f010d..fba8357aaf0 100644 --- a/fs/nls/mac-inuit.c +++ b/fs/nls/mac-inuit.c @@ -513,7 +513,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macinuit(void) diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c index 53ce0809cbd..b6a98a5208c 100644 --- a/fs/nls/mac-roman.c +++ b/fs/nls/mac-roman.c @@ -618,7 +618,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macroman(void) diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c index add6f7a0c66..25547f02363 100644 --- a/fs/nls/mac-romanian.c +++ b/fs/nls/mac-romanian.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macromanian(void) diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c index dffa96d5de0..b5454bc7b7f 100644 --- a/fs/nls/mac-turkish.c +++ b/fs/nls/mac-turkish.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macturkish(void) diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c index 7020e940f74..a2620650d5e 100644 --- a/fs/nls/nls_ascii.c +++ b/fs/nls/nls_ascii.c @@ -148,7 +148,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_ascii(void) diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index fea6bd5831d..52ccd34b1e7 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -232,13 +232,14 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, } EXPORT_SYMBOL(utf16s_to_utf8s); -int register_nls(struct nls_table * nls) +int __register_nls(struct nls_table *nls, struct module *owner) { struct nls_table ** tmp = &tables; if (nls->next) return -EBUSY; + nls->owner = owner; spin_lock(&nls_lock); while (*tmp) { if (nls == *tmp) { @@ -252,6 +253,7 @@ int register_nls(struct nls_table * nls) spin_unlock(&nls_lock); return 0; } +EXPORT_SYMBOL(__register_nls); int unregister_nls(struct nls_table * nls) { @@ -538,7 +540,6 @@ struct nls_table *load_nls_default(void) return &default_table; } -EXPORT_SYMBOL(register_nls); EXPORT_SYMBOL(unregister_nls); EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c index c8471fe78e4..ace3e19d340 100644 --- a/fs/nls/nls_cp1250.c +++ b/fs/nls/nls_cp1250.c @@ -329,7 +329,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1250(void) diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c index 1939b46e772..9273ddfd08a 100644 --- a/fs/nls/nls_cp1251.c +++ b/fs/nls/nls_cp1251.c @@ -283,7 +283,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1251(void) diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c index 8120ae2e091..1caf5dfed85 100644 --- a/fs/nls/nls_cp1255.c +++ b/fs/nls/nls_cp1255.c @@ -365,7 +365,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1255(void) diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c index ff37a4628ce..7ddb830da3f 100644 --- a/fs/nls/nls_cp437.c +++ b/fs/nls/nls_cp437.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp437(void) diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c index f5576b8be1b..c593f683a0c 100644 --- a/fs/nls/nls_cp737.c +++ b/fs/nls/nls_cp737.c @@ -332,7 +332,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp737(void) diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c index 4905635d1c0..554c863745f 100644 --- a/fs/nls/nls_cp775.c +++ b/fs/nls/nls_cp775.c @@ -301,7 +301,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp775(void) diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c index fe5bdad50e2..56cccd14b40 100644 --- a/fs/nls/nls_cp850.c +++ b/fs/nls/nls_cp850.c @@ -297,7 +297,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp850(void) diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c index ceb1c0166dd..7cdc05ac1d4 100644 --- a/fs/nls/nls_cp852.c +++ b/fs/nls/nls_cp852.c @@ -319,7 +319,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp852(void) diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c index cc7f5fb2e0c..7426eea0566 100644 --- a/fs/nls/nls_cp855.c +++ b/fs/nls/nls_cp855.c @@ -281,7 +281,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp855(void) diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c index e418e198e8d..098309733eb 100644 --- a/fs/nls/nls_cp857.c +++ b/fs/nls/nls_cp857.c @@ -283,7 +283,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp857(void) diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c index a86c97d1aa3..84224478e73 100644 --- a/fs/nls/nls_cp860.c +++ b/fs/nls/nls_cp860.c @@ -346,7 +346,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp860(void) diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c index bd920227acd..dc873e4be09 100644 --- a/fs/nls/nls_cp861.c +++ b/fs/nls/nls_cp861.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp861(void) diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c index e9b68eb3daf..d5263e3c556 100644 --- a/fs/nls/nls_cp862.c +++ b/fs/nls/nls_cp862.c @@ -403,7 +403,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp862(void) diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c index f8a9b07ab4e..051c9832e36 100644 --- a/fs/nls/nls_cp863.c +++ b/fs/nls/nls_cp863.c @@ -363,7 +363,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp863(void) diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c index 8d31f435fc6..97eb1273b2f 100644 --- a/fs/nls/nls_cp864.c +++ b/fs/nls/nls_cp864.c @@ -389,7 +389,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp864(void) diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c index 4bd902fe3ec..11121422852 100644 --- a/fs/nls/nls_cp865.c +++ b/fs/nls/nls_cp865.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp865(void) diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c index bdc7cb39139..ffdcbc3fc38 100644 --- a/fs/nls/nls_cp866.c +++ b/fs/nls/nls_cp866.c @@ -287,7 +287,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp866(void) diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c index 9f283a2b151..3b5a3458935 100644 --- a/fs/nls/nls_cp869.c +++ b/fs/nls/nls_cp869.c @@ -297,7 +297,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp869(void) diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c index 0b3c4886f8c..8dfaa10710f 100644 --- a/fs/nls/nls_cp874.c +++ b/fs/nls/nls_cp874.c @@ -256,7 +256,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp874(void) diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c index 0ffed6f1ceb..67b7398e848 100644 --- a/fs/nls/nls_cp932.c +++ b/fs/nls/nls_cp932.c @@ -7914,7 +7914,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp932(void) diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c index 82770301bc3..c96546cfec9 100644 --- a/fs/nls/nls_cp936.c +++ b/fs/nls/nls_cp936.c @@ -11092,7 +11092,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp936(void) diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c index 8a7a2fe85c6..199171e97aa 100644 --- a/fs/nls/nls_cp949.c +++ b/fs/nls/nls_cp949.c @@ -13927,7 +13927,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp949(void) diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c index ef2536829aa..8e141870820 100644 --- a/fs/nls/nls_cp950.c +++ b/fs/nls/nls_cp950.c @@ -9463,7 +9463,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp950(void) diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c index 7424929a278..162b3f16035 100644 --- a/fs/nls/nls_euc-jp.c +++ b/fs/nls/nls_euc-jp.c @@ -553,7 +553,6 @@ static struct nls_table table = { .charset = "euc-jp", .uni2char = uni2char, .char2uni = char2uni, - .owner = THIS_MODULE, }; static int __init init_nls_euc_jp(void) diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c index 7b951bb5849..69ac020d43b 100644 --- a/fs/nls/nls_iso8859-1.c +++ b/fs/nls/nls_iso8859-1.c @@ -239,7 +239,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_1(void) diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c index c4d52ea9f09..afb3f8f275f 100644 --- a/fs/nls/nls_iso8859-13.c +++ b/fs/nls/nls_iso8859-13.c @@ -267,7 +267,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_13(void) diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c index dc02600c7fe..046370f0b6f 100644 --- a/fs/nls/nls_iso8859-14.c +++ b/fs/nls/nls_iso8859-14.c @@ -323,7 +323,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_14(void) diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c index 3c7dfc832ef..7e34a841a05 100644 --- a/fs/nls/nls_iso8859-15.c +++ b/fs/nls/nls_iso8859-15.c @@ -289,7 +289,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_15(void) diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c index a2d2197e4c7..7dd57118174 100644 --- a/fs/nls/nls_iso8859-2.c +++ b/fs/nls/nls_iso8859-2.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_2(void) diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c index a61e0daa3a8..740b75ec449 100644 --- a/fs/nls/nls_iso8859-3.c +++ b/fs/nls/nls_iso8859-3.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_3(void) diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c index e8ff555483b..8826021e32f 100644 --- a/fs/nls/nls_iso8859-4.c +++ b/fs/nls/nls_iso8859-4.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_4(void) diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c index 4721e893012..7c04057a1ad 100644 --- a/fs/nls/nls_iso8859-5.c +++ b/fs/nls/nls_iso8859-5.c @@ -254,7 +254,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_5(void) diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c index 01a517d6d30..d4a881400d7 100644 --- a/fs/nls/nls_iso8859-6.c +++ b/fs/nls/nls_iso8859-6.c @@ -245,7 +245,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_6(void) diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c index 2d27b93ef19..37b75d825a7 100644 --- a/fs/nls/nls_iso8859-7.c +++ b/fs/nls/nls_iso8859-7.c @@ -299,7 +299,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_7(void) diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c index 694bf070c72..557b98250d3 100644 --- a/fs/nls/nls_iso8859-9.c +++ b/fs/nls/nls_iso8859-9.c @@ -254,7 +254,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_9(void) diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c index 43875310540..811f232fccf 100644 --- a/fs/nls/nls_koi8-r.c +++ b/fs/nls/nls_koi8-r.c @@ -305,7 +305,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_r(void) diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c index e7bc1d75c78..a80a741a867 100644 --- a/fs/nls/nls_koi8-ru.c +++ b/fs/nls/nls_koi8-ru.c @@ -55,7 +55,6 @@ static struct nls_table table = { .charset = "koi8-ru", .uni2char = uni2char, .char2uni = char2uni, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_ru(void) diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c index 8c9f0292b5a..7e029e4c188 100644 --- a/fs/nls/nls_koi8-u.c +++ b/fs/nls/nls_koi8-u.c @@ -312,7 +312,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_u(void) diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index 0d60a44acac..afcfbc4a14d 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -46,7 +46,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = identity, /* no conversion */ .charset2upper = identity, - .owner = THIS_MODULE, }; static int __init init_nls_utf8(void) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 1fedd5f7ccc..abc8cbcfe90 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) * events. */ static int dnotify_handle_event(struct fsnotify_group *group, + struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *file_name, u32 cookie) { struct dnotify_mark *dn_mark; - struct inode *to_tell; struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; - __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; + __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; - BUG_ON(vfsmount_mark); + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return 0; - to_tell = event->to_tell; + BUG_ON(vfsmount_mark); dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); @@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group, return 0; } -/* - * Given an inode and mask determine if dnotify would be interested in sending - * userspace notification for that pair. - */ -static bool dnotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - /* not a dir, dnotify doesn't care */ - if (!S_ISDIR(inode->i_mode)) - return false; - - return true; -} - static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) { struct dnotify_mark *dn_mark = container_of(fsn_mark, @@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, - .should_send_event = dnotify_should_send_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event_priv = NULL, }; /* diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 0c2f9122b26..ee9cb3795c2 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -9,91 +9,59 @@ #include <linux/types.h> #include <linux/wait.h> -static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) +#include "fanotify.h" + +static bool should_merge(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - pr_debug("%s: old=%p new=%p\n", __func__, old, new); + struct fanotify_event_info *old, *new; - if (old->to_tell == new->to_tell && - old->data_type == new->data_type && - old->tgid == new->tgid) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_PATH): -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - /* dont merge two permission events */ - if ((old->mask & FAN_ALL_PERM_EVENTS) && - (new->mask & FAN_ALL_PERM_EVENTS)) - return false; -#endif - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - return true; - default: - BUG(); - }; - } + pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); + old = FANOTIFY_E(old_fsn); + new = FANOTIFY_E(new_fsn); + + if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && + old->path.mnt == new->path.mnt && + old->path.dentry == new->path.dentry) + return true; return false; } /* and the list better be locked by something too! */ -static struct fsnotify_event *fanotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) { - struct fsnotify_event_holder *test_holder; - struct fsnotify_event *test_event = NULL; - struct fsnotify_event *new_event; + struct fsnotify_event *test_event; + bool do_merge = false; pr_debug("%s: list=%p event=%p\n", __func__, list, event); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + /* + * Don't merge a permission event with any other event so that we know + * the event structure we have created in fanotify_handle_event() is the + * one we should check for permission response. + */ + if (event->mask & FAN_ALL_PERM_EVENTS) + return 0; +#endif - list_for_each_entry_reverse(test_holder, list, event_list) { - if (should_merge(test_holder->event, event)) { - test_event = test_holder->event; + list_for_each_entry_reverse(test_event, list, list) { + if (should_merge(test_event, event)) { + do_merge = true; break; } } - if (!test_event) - return NULL; - - fsnotify_get_event(test_event); - - /* if they are exactly the same we are done */ - if (test_event->mask == event->mask) - return test_event; - - /* - * if the refcnt == 2 this is the only queue - * for this event and so we can update the mask - * in place. - */ - if (atomic_read(&test_event->refcnt) == 2) { - test_event->mask |= event->mask; - return test_event; - } - - new_event = fsnotify_clone_event(test_event); - - /* done with test_event */ - fsnotify_put_event(test_event); - - /* couldn't allocate memory, merge was not possible */ - if (unlikely(!new_event)) - return ERR_PTR(-ENOMEM); - - /* build new event and replace it on the list */ - new_event->mask = (test_event->mask | event->mask); - fsnotify_replace_event(test_holder, new_event); + if (!do_merge) + return 0; - /* we hold a reference on new_event from clone_event */ - return new_event; + test_event->mask |= event->mask; + return 1; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS -static int fanotify_get_response_from_access(struct fsnotify_group *group, - struct fsnotify_event *event) +static int fanotify_get_response(struct fsnotify_group *group, + struct fanotify_perm_event_info *event) { int ret; @@ -106,7 +74,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, return 0; /* userspace responded, convert to something usable */ - spin_lock(&event->lock); switch (event->response) { case FAN_ALLOW: ret = 0; @@ -116,7 +83,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, ret = -EPERM; } event->response = 0; - spin_unlock(&event->lock); pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); @@ -125,58 +91,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, } #endif -static int fanotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *fanotify_mark, - struct fsnotify_event *event) -{ - int ret = 0; - struct fsnotify_event *notify_event = NULL; - - BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); - BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); - BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); - BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); - BUILD_BUG_ON(FAN_OPEN != FS_OPEN); - BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); - BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); - BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); - BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); - BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); - - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - - notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge); - if (IS_ERR(notify_event)) - return PTR_ERR(notify_event); - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) { - /* if we merged we need to wait on the new event */ - if (notify_event) - event = notify_event; - ret = fanotify_get_response_from_access(group, event); - } -#endif - - if (notify_event) - fsnotify_put_event(notify_event); - - return ret; -} - -static bool fanotify_should_send_event(struct fsnotify_group *group, - struct inode *to_tell, - struct fsnotify_mark *inode_mark, +static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, - __u32 event_mask, void *data, int data_type) + u32 event_mask, + void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; struct path *path = data; - pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " - "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, - inode_mark, vfsmnt_mark, event_mask, data, data_type); + pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" + " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, + event_mask, data, data_type); /* if we don't have enough info to send an event to userspace say no */ if (data_type != FSNOTIFY_EVENT_PATH) @@ -217,6 +142,93 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, return false; } +struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, + struct path *path) +{ + struct fanotify_event_info *event; + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (mask & FAN_ALL_PERM_EVENTS) { + struct fanotify_perm_event_info *pevent; + + pevent = kmem_cache_alloc(fanotify_perm_event_cachep, + GFP_KERNEL); + if (!pevent) + return NULL; + event = &pevent->fae; + pevent->response = 0; + goto init; + } +#endif + event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + if (!event) + return NULL; +init: __maybe_unused + fsnotify_init_event(&event->fse, inode, mask); + event->tgid = get_pid(task_tgid(current)); + if (path) { + event->path = *path; + path_get(&event->path); + } else { + event->path.mnt = NULL; + event->path.dentry = NULL; + } + return event; +} + +static int fanotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *fanotify_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name, u32 cookie) +{ + int ret = 0; + struct fanotify_event_info *event; + struct fsnotify_event *fsn_event; + + BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); + BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); + BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + + if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data, + data_type)) + return 0; + + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); + + event = fanotify_alloc_event(inode, mask, data); + if (unlikely(!event)) + return -ENOMEM; + + fsn_event = &event->fse; + ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); + if (ret) { + /* Permission events shouldn't be merged */ + BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); + + return 0; + } + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (mask & FAN_ALL_PERM_EVENTS) { + ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event)); + fsnotify_destroy_event(group, fsn_event); + } +#endif + return ret; +} + static void fanotify_free_group_priv(struct fsnotify_group *group) { struct user_struct *user; @@ -226,10 +238,25 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) free_uid(user); } +static void fanotify_free_event(struct fsnotify_event *fsn_event) +{ + struct fanotify_event_info *event; + + event = FANOTIFY_E(fsn_event); + path_put(&event->path); + put_pid(event->tgid); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (fsn_event->mask & FAN_ALL_PERM_EVENTS) { + kmem_cache_free(fanotify_perm_event_cachep, + FANOTIFY_PE(fsn_event)); + return; + } +#endif + kmem_cache_free(fanotify_event_cachep, event); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, - .should_send_event = fanotify_should_send_event, .free_group_priv = fanotify_free_group_priv, - .free_event_priv = NULL, - .freeing_mark = NULL, + .free_event = fanotify_free_event, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h new file mode 100644 index 00000000000..2a5fb14115d --- /dev/null +++ b/fs/notify/fanotify/fanotify.h @@ -0,0 +1,50 @@ +#include <linux/fsnotify_backend.h> +#include <linux/path.h> +#include <linux/slab.h> + +extern struct kmem_cache *fanotify_event_cachep; +extern struct kmem_cache *fanotify_perm_event_cachep; + +/* + * Structure for normal fanotify events. It gets allocated in + * fanotify_handle_event() and freed when the information is retrieved by + * userspace + */ +struct fanotify_event_info { + struct fsnotify_event fse; + /* + * We hold ref to this path so it may be dereferenced at any point + * during this object's lifetime + */ + struct path path; + struct pid *tgid; +}; + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +/* + * Structure for permission fanotify events. It gets allocated and freed in + * fanotify_handle_event() since we wait there for user response. When the + * information is retrieved by userspace the structure is moved from + * group->notification_list to group->fanotify_data.access_list to wait for + * user response. + */ +struct fanotify_perm_event_info { + struct fanotify_event_info fae; + int response; /* userspace answer to question */ + int fd; /* fd we passed to userspace for this event */ +}; + +static inline struct fanotify_perm_event_info * +FANOTIFY_PE(struct fsnotify_event *fse) +{ + return container_of(fse, struct fanotify_perm_event_info, fae.fse); +} +#endif + +static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct fanotify_event_info, fse); +} + +struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, + struct path *path); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index e44cb6427df..3fdc8a3e113 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -19,21 +19,30 @@ #include "../../mount.h" #include "../fdinfo.h" +#include "fanotify.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 #define FANOTIFY_DEFAULT_MAX_LISTENERS 128 +/* + * All flags that may be specified in parameter event_f_flags of fanotify_init. + * + * Internal and external open flags are stored together in field f_flags of + * struct file. Only external open flags shall be allowed in event_f_flags. + * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be + * excluded. + */ +#define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ + O_ACCMODE | O_APPEND | O_NONBLOCK | \ + __O_SYNC | O_DSYNC | O_CLOEXEC | \ + O_LARGEFILE | O_NOATIME ) + extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; -static struct kmem_cache *fanotify_response_event_cache __read_mostly; - -struct fanotify_response_event { - struct list_head list; - __s32 fd; - struct fsnotify_event *event; -}; +struct kmem_cache *fanotify_event_cachep __read_mostly; +struct kmem_cache *fanotify_perm_event_cachep __read_mostly; /* * Get an fsnotify notification event if one exists and is small @@ -61,8 +70,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, } static int create_fd(struct fsnotify_group *group, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_info *event, + struct file **file) { int client_fd; struct file *new_file; @@ -73,12 +82,6 @@ static int create_fd(struct fsnotify_group *group, if (client_fd < 0) return client_fd; - if (event->data_type != FSNOTIFY_EVENT_PATH) { - WARN_ON(1); - put_unused_fd(client_fd); - return -EINVAL; - } - /* * we need a new file handle for the userspace program so it can read even if it was * originally opened O_WRONLY. @@ -109,23 +112,25 @@ static int create_fd(struct fsnotify_group *group, } static int fill_event_metadata(struct fsnotify_group *group, - struct fanotify_event_metadata *metadata, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_metadata *metadata, + struct fsnotify_event *fsn_event, + struct file **file) { int ret = 0; + struct fanotify_event_info *event; pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, - group, metadata, event); + group, metadata, fsn_event); *file = NULL; + event = container_of(fsn_event, struct fanotify_event_info, fse); metadata->event_len = FAN_EVENT_METADATA_LEN; metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->reserved = 0; - metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; + metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); - if (unlikely(event->mask & FAN_Q_OVERFLOW)) + if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) metadata->fd = FAN_NOFD; else { metadata->fd = create_fd(group, event, file); @@ -137,33 +142,34 @@ static int fill_event_metadata(struct fsnotify_group *group, } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS -static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group, - __s32 fd) +static struct fanotify_perm_event_info *dequeue_event( + struct fsnotify_group *group, int fd) { - struct fanotify_response_event *re, *return_re = NULL; + struct fanotify_perm_event_info *event, *return_e = NULL; - mutex_lock(&group->fanotify_data.access_mutex); - list_for_each_entry(re, &group->fanotify_data.access_list, list) { - if (re->fd != fd) + spin_lock(&group->fanotify_data.access_lock); + list_for_each_entry(event, &group->fanotify_data.access_list, + fae.fse.list) { + if (event->fd != fd) continue; - list_del_init(&re->list); - return_re = re; + list_del_init(&event->fae.fse.list); + return_e = event; break; } - mutex_unlock(&group->fanotify_data.access_mutex); + spin_unlock(&group->fanotify_data.access_lock); - pr_debug("%s: found return_re=%p\n", __func__, return_re); + pr_debug("%s: found return_re=%p\n", __func__, return_e); - return return_re; + return return_e; } static int process_access_response(struct fsnotify_group *group, struct fanotify_response *response_struct) { - struct fanotify_response_event *re; - __s32 fd = response_struct->fd; - __u32 response = response_struct->response; + struct fanotify_perm_event_info *event; + int fd = response_struct->fd; + int response = response_struct->response; pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, fd, response); @@ -183,58 +189,15 @@ static int process_access_response(struct fsnotify_group *group, if (fd < 0) return -EINVAL; - re = dequeue_re(group, fd); - if (!re) + event = dequeue_event(group, fd); + if (!event) return -ENOENT; - re->event->response = response; - + event->response = response; wake_up(&group->fanotify_data.access_waitq); - kmem_cache_free(fanotify_response_event_cache, re); - return 0; } - -static int prepare_for_access_response(struct fsnotify_group *group, - struct fsnotify_event *event, - __s32 fd) -{ - struct fanotify_response_event *re; - - if (!(event->mask & FAN_ALL_PERM_EVENTS)) - return 0; - - re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL); - if (!re) - return -ENOMEM; - - re->event = event; - re->fd = fd; - - mutex_lock(&group->fanotify_data.access_mutex); - - if (atomic_read(&group->fanotify_data.bypass_perm)) { - mutex_unlock(&group->fanotify_data.access_mutex); - kmem_cache_free(fanotify_response_event_cache, re); - event->response = FAN_ALLOW; - return 0; - } - - list_add_tail(&re->list, &group->fanotify_data.access_list); - mutex_unlock(&group->fanotify_data.access_mutex); - - return 0; -} - -#else -static int prepare_for_access_response(struct fsnotify_group *group, - struct fsnotify_event *event, - __s32 fd) -{ - return 0; -} - #endif static ssize_t copy_event_to_user(struct fsnotify_group *group, @@ -249,7 +212,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); if (ret < 0) - goto out; + return ret; fd = fanotify_event_metadata.fd; ret = -EFAULT; @@ -257,9 +220,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fanotify_event_metadata.event_len)) goto out_close_fd; - ret = prepare_for_access_response(group, event, fd); - if (ret) - goto out_close_fd; +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (event->mask & FAN_ALL_PERM_EVENTS) + FANOTIFY_PE(event)->fd = fd; +#endif if (fd != FAN_NOFD) fd_install(fd, f); @@ -270,13 +234,6 @@ out_close_fd: put_unused_fd(fd); fput(f); } -out: -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) { - event->response = FAN_DENY; - wake_up(&group->fanotify_data.access_waitq); - } -#endif return ret; } @@ -316,30 +273,50 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, kevent = get_one_event(group, count); mutex_unlock(&group->notification_mutex); - if (kevent) { + if (IS_ERR(kevent)) { ret = PTR_ERR(kevent); - if (IS_ERR(kevent)) + break; + } + + if (!kevent) { + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) break; - ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); - if (ret < 0) + + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + + if (start != buf) break; - buf += ret; - count -= ret; + schedule(); continue; } - ret = -EAGAIN; - if (file->f_flags & O_NONBLOCK) - break; - ret = -ERESTARTSYS; - if (signal_pending(current)) - break; - - if (start != buf) - break; - - schedule(); + ret = copy_event_to_user(group, kevent, buf); + /* + * Permission events get queued to wait for response. Other + * events can be destroyed now. + */ + if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) { + fsnotify_destroy_event(group, kevent); + if (ret < 0) + break; + } else { +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (ret < 0) { + FANOTIFY_PE(kevent)->response = FAN_DENY; + wake_up(&group->fanotify_data.access_waitq); + break; + } + spin_lock(&group->fanotify_data.access_lock); + list_add_tail(&kevent->list, + &group->fanotify_data.access_list); + spin_unlock(&group->fanotify_data.access_lock); +#endif + } + buf += ret; + count -= ret; } finish_wait(&group->notification_waitq, &wait); @@ -380,22 +357,21 @@ static int fanotify_release(struct inode *ignored, struct file *file) struct fsnotify_group *group = file->private_data; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - struct fanotify_response_event *re, *lre; + struct fanotify_perm_event_info *event, *next; - mutex_lock(&group->fanotify_data.access_mutex); + spin_lock(&group->fanotify_data.access_lock); atomic_inc(&group->fanotify_data.bypass_perm); - list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { - pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, - re, re->event); - - list_del_init(&re->list); - re->event->response = FAN_ALLOW; + list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, + fae.fse.list) { + pr_debug("%s: found group=%p event=%p\n", __func__, group, + event); - kmem_cache_free(fanotify_response_event_cache, re); + list_del_init(&event->fae.fse.list); + event->response = FAN_ALLOW; } - mutex_unlock(&group->fanotify_data.access_mutex); + spin_unlock(&group->fanotify_data.access_lock); wake_up(&group->fanotify_data.access_waitq); #endif @@ -409,7 +385,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -421,7 +397,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) + list_for_each_entry(fsn_event, &group->notification_list, list) send_len += FAN_EVENT_METADATA_LEN; mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -695,6 +671,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) struct fsnotify_group *group; int f_flags, fd; struct user_struct *user; + struct fanotify_event_info *oevent; pr_debug("%s: flags=%d event_f_flags=%d\n", __func__, flags, event_f_flags); @@ -705,6 +682,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (flags & ~FAN_ALL_INIT_FLAGS) return -EINVAL; + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) + return -EINVAL; + + switch (event_f_flags & O_ACCMODE) { + case O_RDONLY: + case O_RDWR: + case O_WRONLY: + break; + default: + return -EINVAL; + } + user = get_current_user(); if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { free_uid(user); @@ -727,9 +716,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); + oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL); + if (unlikely(!oevent)) { + fd = -ENOMEM; + goto out_destroy_group; + } + group->overflow_event = &oevent->fse; + + if (force_o_largefile()) + event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - mutex_init(&group->fanotify_data.access_mutex); + spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); atomic_set(&group->fanotify_data.bypass_perm, 0); @@ -803,7 +801,10 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, case FAN_MARK_REMOVE: if (!mask) return -EINVAL; + break; case FAN_MARK_FLUSH: + if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH)) + return -EINVAL; break; default: return -EINVAL; @@ -840,6 +841,15 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, group->priority == FS_PRIO_0) goto fput_and_out; + if (flags & FAN_MARK_FLUSH) { + ret = 0; + if (flags & FAN_MARK_MOUNT) + fsnotify_clear_vfsmount_marks_by_group(group); + else + fsnotify_clear_inode_marks_by_group(group); + goto fput_and_out; + } + ret = fanotify_find_path(dfd, pathname, &path, flags); if (ret) goto fput_and_out; @@ -851,7 +861,7 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, mnt = path.mnt; /* create/update an inode mark */ - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { case FAN_MARK_ADD: if (flags & FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); @@ -864,12 +874,6 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, else ret = fanotify_remove_inode_mark(group, inode, mask, flags); break; - case FAN_MARK_FLUSH: - if (flags & FAN_MARK_MOUNT) - fsnotify_clear_vfsmount_marks_by_group(group); - else - fsnotify_clear_inode_marks_by_group(group); - break; default: ret = -EINVAL; } @@ -888,9 +892,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, { return sys_fanotify_mark(fanotify_fd, flags, #ifdef __BIG_ENDIAN - ((__u64)mask1 << 32) | mask0, -#else ((__u64)mask0 << 32) | mask1, +#else + ((__u64)mask1 << 32) | mask0, #endif dfd, pathname); } @@ -904,8 +908,11 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, static int __init fanotify_user_setup(void) { fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); - fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, - SLAB_PANIC); + fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info, + SLAB_PANIC); +#endif return 0; } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4bb21d67d9b..9d3e9c50066 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_mark *vfsmount_mark, __u32 mask, void *data, int data_is, u32 cookie, - const unsigned char *file_name, - struct fsnotify_event **event) + const unsigned char *file_name) { struct fsnotify_group *group = NULL; __u32 inode_test_mask = 0; @@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell, pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" - " data=%p data_is=%d cookie=%d event=%p\n", + " data=%p data_is=%d cookie=%d\n", __func__, group, to_tell, mask, inode_mark, inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, - data_is, cookie, *event); + data_is, cookie); if (!inode_test_mask && !vfsmount_test_mask) return 0; - if (group->ops->should_send_event(group, to_tell, inode_mark, - vfsmount_mark, mask, data, - data_is) == false) - return 0; - - if (!*event) { - *event = fsnotify_create_event(to_tell, mask, data, - data_is, file_name, - cookie, GFP_KERNEL); - if (!*event) - return -ENOMEM; - } - return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event); + return group->ops->handle_event(group, to_tell, inode_mark, + vfsmount_mark, mask, data, data_is, + file_name, cookie); } /* @@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; - struct fsnotify_event *event = NULL; struct mount *mnt; int idx, ret = 0; /* global tests shouldn't care about events on child only the specific event */ @@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, if (inode_group > vfsmount_group) { /* handle inode */ - ret = send_to_group(to_tell, inode_mark, NULL, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, inode_mark, NULL, mask, + data, data_is, cookie, file_name); /* we didn't use the vfsmount_mark */ vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { - ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, + data, data_is, cookie, file_name); inode_group = NULL; } else { ret = send_to_group(to_tell, inode_mark, vfsmount_mark, - mask, data, data_is, cookie, file_name, - &event); + mask, data, data_is, cookie, + file_name); } if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) @@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, idx); - /* - * fsnotify_create_event() took a reference so the event can't be cleaned - * up while we are still trying to add it to lists, drop that one. - */ - if (event) - fsnotify_put_event(event); return ret; } diff --git a/fs/notify/group.c b/fs/notify/group.c index bd2625bd88b..ad199598045 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -55,6 +55,13 @@ void fsnotify_destroy_group(struct fsnotify_group *group) /* clear the notification queue of all events */ fsnotify_flush_notify(group); + /* + * Destroy overflow event (we cannot use fsnotify_destroy_event() as + * that deliberately ignores overflow events. + */ + if (group->overflow_event) + group->ops->free_event(group->overflow_event); + fsnotify_put_group(group); } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index b6642e4de4b..ed855ef6f07 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -2,11 +2,12 @@ #include <linux/inotify.h> #include <linux/slab.h> /* struct kmem_cache */ -extern struct kmem_cache *event_priv_cachep; - -struct inotify_event_private_data { - struct fsnotify_event_private_data fsnotify_event_priv_data; +struct inotify_event_info { + struct fsnotify_event fse; int wd; + u32 sync_cookie; + int name_len; + char name[]; }; struct inotify_inode_mark { @@ -14,8 +15,18 @@ struct inotify_inode_mark { int wd; }; +static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct inotify_event_info, fse); +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); -extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); +extern int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 4216308b81b..43ab1e1a07a 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -34,107 +34,90 @@ #include "inotify.h" /* - * Check if 2 events contain the same information. We do not compare private data - * but at this moment that isn't a problem for any know fsnotify listeners. + * Check if 2 events contain the same information. */ -static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) +static bool event_compare(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - if ((old->mask == new->mask) && - (old->to_tell == new->to_tell) && - (old->data_type == new->data_type) && - (old->name_len == new->name_len)) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_INODE): - /* remember, after old was put on the wait_q we aren't - * allowed to look at the inode any more, only thing - * left to check was if the file_name is the same */ - if (!old->name_len || - !strcmp(old->file_name, new->file_name)) - return true; - break; - case (FSNOTIFY_EVENT_PATH): - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - if (old->mask & FS_Q_OVERFLOW) - return true; - else if (old->mask & FS_IN_IGNORED) - return false; - return true; - }; - } + struct inotify_event_info *old, *new; + + if (old_fsn->mask & FS_IN_IGNORED) + return false; + old = INOTIFY_E(old_fsn); + new = INOTIFY_E(new_fsn); + if ((old_fsn->mask == new_fsn->mask) && + (old_fsn->inode == new_fsn->inode) && + (old->name_len == new->name_len) && + (!old->name_len || !strcmp(old->name, new->name))) + return true; return false; } -static struct fsnotify_event *inotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int inotify_merge(struct list_head *list, + struct fsnotify_event *event) { - struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; - /* and the list better be locked by something too */ - spin_lock(&event->lock); - - last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); - last_event = last_holder->event; - if (event_compare(last_event, event)) - fsnotify_get_event(last_event); - else - last_event = NULL; - - spin_unlock(&event->lock); - - return last_event; + last_event = list_entry(list->prev, struct fsnotify_event, list); + return event_compare(last_event, event); } -static int inotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) +int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name, u32 cookie) { struct inotify_inode_mark *i_mark; - struct inode *to_tell; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - struct fsnotify_event *added_event; - int wd, ret = 0; + struct inotify_event_info *event; + struct fsnotify_event *fsn_event; + int ret; + int len = 0; + int alloc_len = sizeof(struct inotify_event_info); BUG_ON(vfsmount_mark); - pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, - event, event->to_tell, event->mask); + if ((inode_mark->mask & FS_EXCL_UNLINK) && + (data_type == FSNOTIFY_EVENT_PATH)) { + struct path *path = data; + + if (d_unlinked(path->dentry)) + return 0; + } + if (file_name) { + len = strlen(file_name); + alloc_len += len + 1; + } - to_tell = event->to_tell; + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - wd = i_mark->wd; - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); - if (unlikely(!event_priv)) + event = kmalloc(alloc_len, GFP_KERNEL); + if (unlikely(!event)) return -ENOMEM; - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = wd; - - added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); - if (added_event) { - inotify_free_event_priv(fsn_event_priv); - if (!IS_ERR(added_event)) - fsnotify_put_event(added_event); - else - ret = PTR_ERR(added_event); + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->wd = i_mark->wd; + event->sync_cookie = cookie; + event->name_len = len; + if (len) + strcpy(event->name, file_name); + + ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); + if (ret) { + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); } if (inode_mark->mask & IN_ONESHOT) fsnotify_destroy_mark(inode_mark, group); - return ret; + return 0; } static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) @@ -142,22 +125,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify inotify_ignored_and_remove_idr(fsn_mark, group); } -static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - if ((inode_mark->mask & FS_EXCL_UNLINK) && - (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; - - if (d_unlinked(path->dentry)) - return false; - } - - return true; -} - /* * This is NEVER supposed to be called. Inotify marks should either have been * removed from the idr when the watch was removed or in the @@ -202,22 +169,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group) free_uid(group->inotify_data.user); } -void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) +static void inotify_free_event(struct fsnotify_event *fsn_event) { - struct inotify_event_private_data *event_priv; - - - event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - - fsnotify_put_group(fsn_event_priv->group); - kmem_cache_free(event_priv_cachep, event_priv); + kfree(INOTIFY_E(fsn_event)); } const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, - .should_send_event = inotify_should_send_event, .free_group_priv = inotify_free_group_priv, - .free_event_priv = inotify_free_event_priv, + .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, }; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 60f954a891a..cc423a30a0c 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly; static int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; -struct kmem_cache *event_priv_cachep __read_mostly; #ifdef CONFIG_SYSCTL @@ -58,7 +57,7 @@ struct kmem_cache *event_priv_cachep __read_mostly; static int zero; -ctl_table inotify_table[] = { +struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &inotify_max_user_instances, @@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) return ret; } +static int round_event_name_len(struct fsnotify_event *fsn_event) +{ + struct inotify_event_info *event; + + event = INOTIFY_E(fsn_event); + if (!event->name_len) + return 0; + return roundup(event->name_len + 1, sizeof(struct inotify_event)); +} + /* * Get an inotify_kernel_event if one exists and is small * enough to fit in "count". Return an error pointer if @@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - if (event->name_len) - event_size += roundup(event->name_len + 1, event_size); - + event_size += round_event_name_len(event); if (event_size > count) return ERR_PTR(-EINVAL); @@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, * buffer we had in "get_one_event()" above. */ static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *event, + struct fsnotify_event *fsn_event, char __user *buf) { struct inotify_event inotify_event; - struct fsnotify_event_private_data *fsn_priv; - struct inotify_event_private_data *priv; + struct inotify_event_info *event; size_t event_size = sizeof(struct inotify_event); - size_t name_len = 0; + size_t name_len; + size_t pad_name_len; - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - - /* we get the inotify watch descriptor from the event private data */ - spin_lock(&event->lock); - fsn_priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - - if (!fsn_priv) - inotify_event.wd = -1; - else { - priv = container_of(fsn_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - inotify_event.wd = priv->wd; - inotify_free_event_priv(fsn_priv); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); + event = INOTIFY_E(fsn_event); + name_len = event->name_len; /* - * round up event->name_len so it is a multiple of event_size + * round up name length so it is a multiple of event_size * plus an extra byte for the terminating '\0'. */ - if (event->name_len) - name_len = roundup(event->name_len + 1, event_size); - inotify_event.len = name_len; - - inotify_event.mask = inotify_mask_to_arg(event->mask); + pad_name_len = round_event_name_len(fsn_event); + inotify_event.len = pad_name_len; + inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); + inotify_event.wd = event->wd; inotify_event.cookie = event->sync_cookie; /* send the main event */ @@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, /* * fsnotify only stores the pathname, so here we have to send the pathname * and then pad that pathname out to a multiple of sizeof(inotify_event) - * with zeros. I get my zeros from the nul_inotify_event. + * with zeros. */ - if (name_len) { - unsigned int len_to_zero = name_len - event->name_len; + if (pad_name_len) { /* copy the path name */ - if (copy_to_user(buf, event->file_name, event->name_len)) + if (copy_to_user(buf, event->name, name_len)) return -EFAULT; - buf += event->name_len; + buf += name_len; /* fill userspace with 0's */ - if (clear_user(buf, len_to_zero)) + if (clear_user(buf, pad_name_len - name_len)) return -EFAULT; - buf += len_to_zero; - event_size += name_len; + event_size += pad_name_len; } return event_size; @@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; - struct fsnotify_event *event; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) { - event = holder->event; + list_for_each_entry(fsn_event, &group->notification_list, + list) { send_len += sizeof(struct inotify_event); - if (event->name_len) - send_len += roundup(event->name_len + 1, - sizeof(struct inotify_event)); + send_len += round_event_name_len(fsn_event); } mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { struct inotify_inode_mark *i_mark; - struct fsnotify_event *ignored_event, *notify_event; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - int ret; - - i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); - - ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_NOFS); - if (!ignored_event) - goto skip_send_ignore; - - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); - if (unlikely(!event_priv)) - goto skip_send_ignore; - - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = i_mark->wd; - - notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL); - if (notify_event) { - if (IS_ERR(notify_event)) - ret = PTR_ERR(notify_event); - else - fsnotify_put_event(notify_event); - inotify_free_event_priv(fsn_event_priv); - } -skip_send_ignore: - /* matches the reference taken when the event was created */ - if (ignored_event) - fsnotify_put_event(ignored_event); + /* Queue ignore event for the watch */ + inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, + NULL, FSNOTIFY_EVENT_NONE, NULL, 0); + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); @@ -675,11 +633,23 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod static struct fsnotify_group *inotify_new_group(unsigned int max_events) { struct fsnotify_group *group; + struct inotify_event_info *oevent; group = fsnotify_alloc_group(&inotify_fsnotify_ops); if (IS_ERR(group)) return group; + oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL); + if (unlikely(!oevent)) { + fsnotify_destroy_group(group); + return ERR_PTR(-ENOMEM); + } + group->overflow_event = &oevent->fse; + fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); + oevent->wd = -1; + oevent->sync_cookie = 0; + oevent->name_len = 0; + group->max_events = max_events; spin_lock_init(&group->inotify_data.idr_lock); @@ -836,7 +806,6 @@ static int __init inotify_user_setup(void) BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); - event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); inotify_max_queued_events = 16384; inotify_max_user_instances = 128; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 923fe4a5f50..d90deaa08e7 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -340,7 +340,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, static int fsnotify_mark_destroy(void *ignored) { struct fsnotify_mark *mark, *next; - LIST_HEAD(private_destroy_list); + struct list_head private_destroy_list; for (;;) { spin_lock(&destroy_lock); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 7b51b05f160..1e58402171a 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -48,15 +48,6 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -static struct kmem_cache *fsnotify_event_cachep; -static struct kmem_cache *fsnotify_event_holder_cachep; -/* - * This is a magic event we send when the q is too full. Since it doesn't - * hold real event information we just keep one system wide and use it any time - * it is needed. It's refcnt is set 1 at kernel init time and will never - * get set to 0 so it will never get 'freed' - */ -static struct fsnotify_event *q_overflow_event; static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); /** @@ -76,186 +67,82 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) return list_empty(&group->notification_list) ? true : false; } -void fsnotify_get_event(struct fsnotify_event *event) +void fsnotify_destroy_event(struct fsnotify_group *group, + struct fsnotify_event *event) { - atomic_inc(&event->refcnt); -} - -void fsnotify_put_event(struct fsnotify_event *event) -{ - if (!event) + /* Overflow events are per-group and we don't want to free them */ + if (!event || event->mask == FS_Q_OVERFLOW) return; - if (atomic_dec_and_test(&event->refcnt)) { - pr_debug("%s: event=%p\n", __func__, event); - - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_put(&event->path); - - BUG_ON(!list_empty(&event->private_data_list)); - - kfree(event->file_name); - put_pid(event->tgid); - kmem_cache_free(fsnotify_event_cachep, event); - } -} - -struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) -{ - return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); -} - -void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) -{ - if (holder) - kmem_cache_free(fsnotify_event_holder_cachep, holder); -} - -/* - * Find the private data that the group previously attached to this event when - * the group added the event to the notification queue (fsnotify_add_notify_event) - */ -struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) -{ - struct fsnotify_event_private_data *lpriv; - struct fsnotify_event_private_data *priv = NULL; - - assert_spin_locked(&event->lock); - - list_for_each_entry(lpriv, &event->private_data_list, event_list) { - if (lpriv->group == group) { - priv = lpriv; - list_del(&priv->event_list); - break; - } - } - return priv; + group->ops->free_event(event); } /* * Add an event to the group notification queue. The group can later pull this - * event off the queue to deal with. If the event is successfully added to the - * group's notification queue, a reference is taken on event. + * event off the queue to deal with. The function returns 0 if the event was + * added to the queue, 1 if the event was merged with some other queued event, + * 2 if the queue of events has overflown. */ -struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv, - struct fsnotify_event *(*merge)(struct list_head *, - struct fsnotify_event *)) +int fsnotify_add_notify_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)) { - struct fsnotify_event *return_event = NULL; - struct fsnotify_event_holder *holder = NULL; + int ret = 0; struct list_head *list = &group->notification_list; - pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); - - /* - * There is one fsnotify_event_holder embedded inside each fsnotify_event. - * Check if we expect to be able to use that holder. If not alloc a new - * holder. - * For the overflow event it's possible that something will use the in - * event holder before we get the lock so we may need to jump back and - * alloc a new holder, this can't happen for most events... - */ - if (!list_empty(&event->holder.event_list)) { -alloc_holder: - holder = fsnotify_alloc_event_holder(); - if (!holder) - return ERR_PTR(-ENOMEM); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, event); mutex_lock(&group->notification_mutex); if (group->q_len >= group->max_events) { - event = q_overflow_event; - - /* - * we need to return the overflow event - * which means we need a ref - */ - fsnotify_get_event(event); - return_event = event; - - /* sorry, no private data on the overflow event */ - priv = NULL; - } - - if (!list_empty(list) && merge) { - struct fsnotify_event *tmp; - - tmp = merge(list, event); - if (tmp) { + ret = 2; + /* Queue overflow event only if it isn't already queued */ + if (!list_empty(&group->overflow_event->list)) { mutex_unlock(&group->notification_mutex); - - if (return_event) - fsnotify_put_event(return_event); - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - return tmp; + return ret; } + event = group->overflow_event; + goto queue; } - spin_lock(&event->lock); - - if (list_empty(&event->holder.event_list)) { - if (unlikely(holder)) - fsnotify_destroy_event_holder(holder); - holder = &event->holder; - } else if (unlikely(!holder)) { - /* between the time we checked above and got the lock the in - * event holder was used, go back and get a new one */ - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - - if (return_event) { - fsnotify_put_event(return_event); - return_event = NULL; + if (!list_empty(list) && merge) { + ret = merge(list, event); + if (ret) { + mutex_unlock(&group->notification_mutex); + return ret; } - - goto alloc_holder; } +queue: group->q_len++; - holder->event = event; - - fsnotify_get_event(event); - list_add_tail(&holder->event_list, list); - if (priv) - list_add_tail(&priv->event_list, &event->private_data_list); - spin_unlock(&event->lock); + list_add_tail(&event->list, list); mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); - return return_event; + return ret; } /* - * Remove and return the first event from the notification list. There is a - * reference held on this event since it was on the list. It is the responsibility - * of the caller to drop this reference. + * Remove and return the first event from the notification list. It is the + * responsibility of the caller to destroy the obtained event */ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_holder *holder; BUG_ON(!mutex_is_locked(&group->notification_mutex)); pr_debug("%s: group=%p\n", __func__, group); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - - event = holder->event; - - spin_lock(&event->lock); - holder->event = NULL; - list_del_init(&holder->event_list); - spin_unlock(&event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - + event = list_first_entry(&group->notification_list, + struct fsnotify_event, list); + /* + * We need to init list head for the case of overflow event so that + * check in fsnotify_add_notify_events() works + */ + list_del_init(&event->list); group->q_len--; return event; @@ -266,15 +153,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group */ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) { - struct fsnotify_event *event; - struct fsnotify_event_holder *holder; - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - event = holder->event; - - return event; + return list_first_entry(&group->notification_list, + struct fsnotify_event, list); } /* @@ -284,181 +166,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_private_data *priv; mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_notify_event(group); - /* if they don't implement free_event_priv they better not have attached any */ - if (group->ops->free_event_priv) { - spin_lock(&event->lock); - priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - if (priv) - group->ops->free_event_priv(priv); - } - fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ + fsnotify_destroy_event(group, event); } mutex_unlock(&group->notification_mutex); } -static void initialize_event(struct fsnotify_event *event) -{ - INIT_LIST_HEAD(&event->holder.event_list); - atomic_set(&event->refcnt, 1); - - spin_lock_init(&event->lock); - - INIT_LIST_HEAD(&event->private_data_list); -} - -/* - * Caller damn well better be holding whatever mutex is protecting the - * old_holder->event_list and the new_event must be a clean event which - * cannot be found anywhere else in the kernel. - */ -int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, - struct fsnotify_event *new_event) -{ - struct fsnotify_event *old_event = old_holder->event; - struct fsnotify_event_holder *new_holder = &new_event->holder; - - enum event_spinlock_class { - SPINLOCK_OLD, - SPINLOCK_NEW, - }; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event); - - /* - * if the new_event's embedded holder is in use someone - * screwed up and didn't give us a clean new event. - */ - BUG_ON(!list_empty(&new_holder->event_list)); - - spin_lock_nested(&old_event->lock, SPINLOCK_OLD); - spin_lock_nested(&new_event->lock, SPINLOCK_NEW); - - new_holder->event = new_event; - list_replace_init(&old_holder->event_list, &new_holder->event_list); - - spin_unlock(&new_event->lock); - spin_unlock(&old_event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (old_holder != &old_event->holder) - fsnotify_destroy_event_holder(old_holder); - - fsnotify_get_event(new_event); /* on the list take reference */ - fsnotify_put_event(old_event); /* off the list, drop reference */ - - return 0; -} - -struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event) -{ - struct fsnotify_event *event; - - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); - if (!event) - return NULL; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event); - - memcpy(event, old_event, sizeof(*event)); - initialize_event(event); - - if (event->name_len) { - event->file_name = kstrdup(old_event->file_name, GFP_KERNEL); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - } - event->tgid = get_pid(old_event->tgid); - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_get(&event->path); - - return event; -} - /* * fsnotify_create_event - Allocate a new event which will be sent to each * group's handle_event function if the group was interested in this * particular event. * - * @to_tell the inode which is supposed to receive the event (sometimes a + * @inode the inode which is supposed to receive the event (sometimes a * parent of the inode to which the event happened. * @mask what actually happened. * @data pointer to the object which was actually affected * @data_type flag indication if the data is a file, path, inode, nothing... * @name the filename, if available */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const unsigned char *name, - u32 cookie, gfp_t gfp) +void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode, + u32 mask) { - struct fsnotify_event *event; - - event = kmem_cache_zalloc(fsnotify_event_cachep, gfp); - if (!event) - return NULL; - - pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n", - __func__, event, to_tell, mask, data, data_type); - - initialize_event(event); - - if (name) { - event->file_name = kstrdup(name, gfp); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - event->name_len = strlen(event->file_name); - } - - event->tgid = get_pid(task_tgid(current)); - event->sync_cookie = cookie; - event->to_tell = to_tell; - event->data_type = data_type; - - switch (data_type) { - case FSNOTIFY_EVENT_PATH: { - struct path *path = data; - event->path.dentry = path->dentry; - event->path.mnt = path->mnt; - path_get(&event->path); - break; - } - case FSNOTIFY_EVENT_INODE: - event->inode = data; - break; - case FSNOTIFY_EVENT_NONE: - event->inode = NULL; - event->path.dentry = NULL; - event->path.mnt = NULL; - break; - default: - BUG(); - } - + INIT_LIST_HEAD(&event->list); + event->inode = inode; event->mask = mask; - - return event; -} - -static __init int fsnotify_notification_init(void) -{ - fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); - fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); - - q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_KERNEL); - if (!q_overflow_event) - panic("unable to allocate fsnotify q_overflow_event\n"); - - return 0; } -subsys_initcall(fsnotify_notification_init); diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index a27e3fecefa..250ed5b20c8 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) if (page) { set_page_dirty(page); unlock_page(page); - mark_page_accessed(page); page_cache_release(page); } ntfs_debug("Done."); diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index ee4144ce5d7..f82498c35e7 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -58,7 +58,7 @@ typedef enum { /** * ntfs_compression_buffer - one buffer for the decompression engine */ -static u8 *ntfs_compression_buffer = NULL; +static u8 *ntfs_compression_buffer; /** * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c index 807150e2c2b..dd6103cc93c 100644 --- a/fs/ntfs/debug.c +++ b/fs/ntfs/debug.c @@ -18,16 +18,9 @@ * distribution in the file COPYING); if not, write to the Free Software * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "debug.h" -/* - * A static buffer to hold the error string being displayed and a spinlock - * to protect concurrent accesses to it. - */ -static char err_buf[1024]; -static DEFINE_SPINLOCK(err_buf_lock); - /** * __ntfs_warning - output a warning to the syslog * @function: name of function outputting the warning @@ -50,6 +43,7 @@ static DEFINE_SPINLOCK(err_buf_lock); void __ntfs_warning(const char *function, const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -59,17 +53,15 @@ void __ntfs_warning(const char *function, const struct super_block *sb, #endif if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; if (sb) - printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n", - sb->s_id, flen ? function : "", err_buf); + pr_warn("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); else - printk(KERN_ERR "NTFS-fs warning: %s(): %s\n", - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); + pr_warn("%s(): %pV\n", flen ? function : "", &vaf); + va_end(args); } /** @@ -94,6 +86,7 @@ void __ntfs_warning(const char *function, const struct super_block *sb, void __ntfs_error(const char *function, const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -103,17 +96,15 @@ void __ntfs_error(const char *function, const struct super_block *sb, #endif if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; if (sb) - printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n", - sb->s_id, flen ? function : "", err_buf); + pr_err("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); else - printk(KERN_ERR "NTFS-fs error: %s(): %s\n", - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); + pr_err("%s(): %pV\n", flen ? function : "", &vaf); + va_end(args); } #ifdef DEBUG @@ -124,6 +115,7 @@ int debug_msgs = 0; void __ntfs_debug (const char *file, int line, const char *function, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -131,13 +123,11 @@ void __ntfs_debug (const char *file, int line, const char *function, return; if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf); va_end(args); - printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line, - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); } /* Dump a runlist. Caller has to provide synchronisation for @rl. */ @@ -149,12 +139,12 @@ void ntfs_debug_dump_runlist(const runlist_element *rl) if (!debug_msgs) return; - printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n"); + pr_debug("Dumping runlist (values in hex):\n"); if (!rl) { - printk(KERN_DEBUG "Run list not present.\n"); + pr_debug("Run list not present.\n"); return; } - printk(KERN_DEBUG "VCN LCN Run length\n"); + pr_debug("VCN LCN Run length\n"); for (i = 0; ; i++) { LCN lcn = (rl + i)->lcn; @@ -163,13 +153,13 @@ void ntfs_debug_dump_runlist(const runlist_element *rl) if (index > -LCN_ENOENT - 1) index = 3; - printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n", + pr_debug("%-16Lx %s %-16Lx%s\n", (long long)(rl + i)->vcn, lcn_str[index], (long long)(rl + i)->length, (rl + i)->length ? "" : " (runlist end)"); } else - printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n", + pr_debug("%-16Lx %-16Lx %-16Lx%s\n", (long long)(rl + i)->vcn, (long long)(rl + i)->lcn, (long long)(rl + i)->length, diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h index 53c27eaf230..61bf091e32a 100644 --- a/fs/ntfs/debug.h +++ b/fs/ntfs/debug.h @@ -48,7 +48,12 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl); #else /* !DEBUG */ -#define ntfs_debug(f, a...) do {} while (0) +#define ntfs_debug(fmt, ...) \ +do { \ + if (0) \ + no_printk(fmt, ##__VA_ARGS__); \ +} while (0) + #define ntfs_debug_dump_runlist(rl) do {} while (0) #endif /* !DEBUG */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index ea4ba9daeb4..5c9e2c81cb1 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, } do { unlock_page(pages[--do_pages]); - mark_page_accessed(pages[do_pages]); page_cache_release(pages[do_pages]); } while (do_pages); if (unlikely(status)) @@ -2091,10 +2090,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, size_t count; /* after file limit checks */ ssize_t written, err; - count = 0; - err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (err) - return err; + count = iov_length(iov, nr_segs); pos = *ppos; /* We can write back this queue in page reclaim. */ current->backing_dev_info = mapping->backing_dev_info; @@ -2134,7 +2130,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); if (ret > 0) { - int err = generic_write_sync(file, pos, ret); + int err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } @@ -2203,8 +2199,8 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, const struct file_operations ntfs_file_ops = { .llseek = generic_file_llseek, /* Seek inside file. */ - .read = do_sync_read, /* Read from file. */ - .aio_read = generic_file_aio_read, /* Async read from file. */ + .read = new_sync_read, /* Read from file. */ + .read_iter = generic_file_read_iter, /* Async read from file. */ #ifdef NTFS_RW .write = do_sync_write, /* Write to file. */ .aio_write = ntfs_file_aio_write, /* Async write to file. */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 2778b0255dc..f47af5e6e23 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -55,7 +55,7 @@ * * Return 1 if the attributes match and 0 if not. * - * NOTE: This function runs with the inode->i_lock spin lock held so it is not + * NOTE: This function runs with the inode_hash_lock spin lock held so it is not * allowed to sleep. */ int ntfs_test_inode(struct inode *vi, ntfs_attr *na) @@ -1704,8 +1704,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) iput(bvi); skip_large_index_stuff: /* Setup the operations for this index inode. */ - vi->i_op = NULL; - vi->i_fop = NULL; vi->i_mapping->a_ops = &ntfs_mst_aops; vi->i_blocks = ni->allocated_size >> 9; /* @@ -2259,7 +2257,7 @@ void ntfs_evict_big_inode(struct inode *vi) { ntfs_inode *ni = NTFS_I(vi); - truncate_inode_pages(&vi->i_data, 0); + truncate_inode_pages_final(&vi->i_data); clear_inode(vi); #ifdef NTFS_RW diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 82650d52d91..6c3296e546c 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -19,6 +19,7 @@ * distribution in the file COPYING); if not, write to the Free Software * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/stddef.h> #include <linux/init.h> @@ -49,8 +50,8 @@ static unsigned long ntfs_nr_compression_users; /* A global default upcase table and a corresponding reference count. */ -static ntfschar *default_upcase = NULL; -static unsigned long ntfs_nr_upcase_users = 0; +static ntfschar *default_upcase; +static unsigned long ntfs_nr_upcase_users; /* Error constants/strings used in inode.c::ntfs_show_options(). */ typedef enum { @@ -468,6 +469,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_debug("Entering with remount options string: %s", opt); + sync_filesystem(sb); + #ifndef NTFS_RW /* For read-only compiled driver, enforce read-only flag. */ *flags |= MS_RDONLY; @@ -1894,7 +1897,7 @@ get_ctx_vol_failed: vol->minor_ver = vi->minor_ver; ntfs_attr_put_search_ctx(ctx); unmap_mft_record(NTFS_I(vol->vol_ino)); - printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver, + pr_info("volume version %i.%i.\n", vol->major_ver, vol->minor_ver); if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " @@ -3093,7 +3096,7 @@ static int __init init_ntfs_fs(void) int err = 0; /* This may be ugly but it results in pretty output so who cares. (-8 */ - printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/" + pr_info("driver " NTFS_VERSION " [Flags: R/" #ifdef NTFS_RW "W" #else @@ -3113,16 +3116,15 @@ static int __init init_ntfs_fs(void) sizeof(ntfs_index_context), 0 /* offset */, SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_index_ctx_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_index_ctx_cache_name); + pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name); goto ictx_err_out; } ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, sizeof(ntfs_attr_search_ctx), 0 /* offset */, SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_attr_ctx_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_attr_ctx_cache_name); + pr_crit("NTFS: Failed to create %s!\n", + ntfs_attr_ctx_cache_name); goto actx_err_out; } @@ -3130,8 +3132,7 @@ static int __init init_ntfs_fs(void) (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, SLAB_HWCACHE_ALIGN, NULL); if (!ntfs_name_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_name_cache_name); + pr_crit("Failed to create %s!\n", ntfs_name_cache_name); goto name_err_out; } @@ -3139,8 +3140,7 @@ static int __init init_ntfs_fs(void) sizeof(ntfs_inode), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); if (!ntfs_inode_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_inode_cache_name); + pr_crit("Failed to create %s!\n", ntfs_inode_cache_name); goto inode_err_out; } @@ -3149,15 +3149,14 @@ static int __init init_ntfs_fs(void) SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_big_inode_cache_name); + pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); goto big_inode_err_out; } /* Register the ntfs sysctls. */ err = ntfs_sysctl(1); if (err) { - printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n"); + pr_crit("Failed to register NTFS sysctls!\n"); goto sysctl_err_out; } @@ -3166,7 +3165,7 @@ static int __init init_ntfs_fs(void) ntfs_debug("NTFS driver registered successfully."); return 0; /* Success! */ } - printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n"); + pr_crit("Failed to register NTFS filesystem driver!\n"); /* Unregister the ntfs sysctls. */ ntfs_sysctl(0); @@ -3182,8 +3181,7 @@ actx_err_out: kmem_cache_destroy(ntfs_index_ctx_cache); ictx_err_out: if (!err) { - printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver " - "registration...\n"); + pr_crit("Aborting NTFS filesystem driver registration...\n"); err = -ENOMEM; } return err; diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index 79a89184cb5..a503156ec15 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c @@ -34,7 +34,7 @@ #include "debug.h" /* Definition of the ntfs sysctl. */ -static ctl_table ntfs_sysctls[] = { +static struct ctl_table ntfs_sysctls[] = { { .procname = "ntfs-debug", .data = &debug_msgs, /* Data pointer and size. */ @@ -46,7 +46,7 @@ static ctl_table ntfs_sysctls[] = { }; /* Define the parent directory /proc/sys/fs. */ -static ctl_table sysctls_root[] = { +static struct ctl_table sysctls_root[] = { { .procname = "fs", .mode = 0555, @@ -56,7 +56,7 @@ static ctl_table sysctls_root[] = { }; /* Storage for the sysctls header. */ -static struct ctl_table_header *sysctls_root_table = NULL; +static struct ctl_table_header *sysctls_root_table; /** * ntfs_sysctl - add or remove the debug sysctl diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f17e58b3298..ce210d4951a 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -38,7 +38,6 @@ ocfs2-objs := \ symlink.o \ sysfile.o \ uptodate.o \ - ver.o \ quota_local.o \ quota_global.o \ xattr.o \ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index b4f788e0ca3..7e8282dcea2 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -160,36 +160,6 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, return acl; } - -/* - * Get posix acl. - */ -static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *di_bh = NULL; - struct posix_acl *acl; - int ret; - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return NULL; - - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - acl = ERR_PTR(ret); - return acl; - } - - acl = ocfs2_get_acl_nolock(inode, type, di_bh); - - ocfs2_inode_unlock(inode, 0); - - brelse(di_bh); - - return acl; -} - /* * Helper function to set i_mode in memory and disk. Some call paths * will not have di_bh or a journal handle to pass, in which case it @@ -235,6 +205,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, di->i_mode = cpu_to_le16(inode->i_mode); di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); @@ -250,7 +221,7 @@ out: /* * Set the access or default ACL of an inode. */ -static int ocfs2_set_acl(handle_t *handle, +int ocfs2_set_acl(handle_t *handle, struct inode *inode, struct buffer_head *di_bh, int type, @@ -313,6 +284,11 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); +} + struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) { struct ocfs2_super *osb; @@ -334,200 +310,3 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) return acl; } - -int ocfs2_acl_chmod(struct inode *inode) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl; - int ret; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (ret) - return ret; - ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, - acl, NULL, NULL); - posix_acl_release(acl); - return ret; -} - -/* - * Initialize the ACLs of a new inode. If parent directory has default ACL, - * then clone to new inode. Called from ocfs2_mknod. - */ -int ocfs2_init_acl(handle_t *handle, - struct inode *inode, - struct inode *dir, - struct buffer_head *di_bh, - struct buffer_head *dir_bh, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl = NULL; - int ret = 0, ret2; - umode_t mode; - - if (!S_ISLNK(inode->i_mode)) { - if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { - acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, - dir_bh); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) { - mode = inode->i_mode & ~current_umask(); - ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); - if (ret) { - mlog_errno(ret); - goto cleanup; - } - } - } - if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - ret = ocfs2_set_acl(handle, inode, di_bh, - ACL_TYPE_DEFAULT, acl, - meta_ac, data_ac); - if (ret) - goto cleanup; - } - mode = inode->i_mode; - ret = posix_acl_create(&acl, GFP_NOFS, &mode); - if (ret < 0) - return ret; - - ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); - if (ret2) { - mlog_errno(ret2); - ret = ret2; - goto cleanup; - } - if (ret > 0) { - ret = ocfs2_set_acl(handle, inode, - di_bh, ACL_TYPE_ACCESS, - acl, meta_ac, data_ac); - } - } -cleanup: - posix_acl_release(acl); - return ret; -} - -static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry, - char *list, - size_t list_len, - const char *name, - size_t name_len, - int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry, - char *list, - size_t list_len, - const char *name, - size_t name_len, - int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - struct posix_acl *acl; - int ret; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ocfs2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return ret; -} - -static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl; - int ret = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return -EOPNOTSUPP; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto cleanup; - } - } else - acl = NULL; - - ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); - -cleanup: - posix_acl_release(acl); - return ret; -} - -const struct xattr_handler ocfs2_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ocfs2_xattr_list_acl_access, - .get = ocfs2_xattr_get_acl, - .set = ocfs2_xattr_set_acl, -}; - -const struct xattr_handler ocfs2_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ocfs2_xattr_list_acl_default, - .get = ocfs2_xattr_get_acl, - .set = ocfs2_xattr_set_acl, -}; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 071fbd380f2..3fce68d0862 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -27,10 +27,13 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); -extern int ocfs2_acl_chmod(struct inode *); -extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, - struct buffer_head *, struct buffer_head *, - struct ocfs2_alloc_context *, - struct ocfs2_alloc_context *); +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ocfs2_set_acl(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int type, + struct posix_acl *acl, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac); #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 17e6bdde96c..9d8fcf2f3b9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1025,7 +1025,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, for(i = count; i < (num_got + count); i++) { bhs[i] = sb_getblk(osb->sb, first_blkno); if (bhs[i] == NULL) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto bail; } @@ -4742,6 +4742,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, enum ocfs2_alloc_restarted *reason_ret) { int status = 0, err = 0; + int need_free = 0; int free_extents; enum ocfs2_alloc_restarted reason = RESTART_NONE; u32 bit_off, num_bits; @@ -4796,7 +4797,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); - goto leave; + need_free = 1; + goto bail; } block = ocfs2_clusters_to_blocks(osb->sb, bit_off); @@ -4807,7 +4809,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, num_bits, flags, meta_ac); if (status < 0) { mlog_errno(status); - goto leave; + need_free = 1; + goto bail; } ocfs2_journal_dirty(handle, et->et_root_bh); @@ -4821,6 +4824,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, reason = RESTART_TRANS; } +bail: + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num_bits); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num_bits); + } + leave: if (reason_ret) *reason_ret = reason; @@ -5712,6 +5728,7 @@ int ocfs2_remove_btree_range(struct inode *inode, } ocfs2_et_update_clusters(et, -len); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, et->et_root_bh); @@ -6029,7 +6046,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work) void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, int cancel) { - if (osb->osb_tl_inode) { + if (osb->osb_tl_inode && + atomic_read(&osb->osb_tl_disable) == 0) { /* We want to push off log flushes while truncates are * still running. */ if (cancel) @@ -6206,6 +6224,8 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; + atomic_set(&osb->osb_tl_disable, 1); + if (tl_inode) { cancel_delayed_work(&osb->osb_truncate_log_wq); flush_workqueue(ocfs2_wq); @@ -6237,6 +6257,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) * until we're sure all is well. */ INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker); + atomic_set(&osb->osb_tl_disable, 0); osb->osb_tl_bh = tl_bh; osb->osb_tl_inode = tl_inode; @@ -6805,6 +6826,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { int ret, i, has_data, num_pages = 0; + int need_free = 0; + u32 bit_off, num; handle_t *handle; u64 uninitialized_var(block); struct ocfs2_inode_info *oi = OCFS2_I(inode); @@ -6850,7 +6873,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } if (has_data) { - u32 bit_off, num; unsigned int page_end; u64 phys; @@ -6886,6 +6908,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6896,6 +6919,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_read_inline_data(inode, pages[0], di_bh); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6913,6 +6937,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_dinode_new_extent_list(inode, di); ocfs2_journal_dirty(handle, di_bh); @@ -6927,6 +6952,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6938,6 +6964,18 @@ out_commit: dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num); + } + ocfs2_commit_trans(osb, handle); out_unlock: @@ -7126,7 +7164,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, if (end > i_size_read(inode)) end = i_size_read(inode); - BUG_ON(start >= end); + BUG_ON(start > end); if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || @@ -7176,6 +7214,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); out_commit: @@ -7260,14 +7299,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; minlen = range->minlen >> osb->s_clustersize_bits; - trimmed = 0; - if (!len) { - range->len = 0; - return 0; - } - - if (minlen >= osb->bitmap_cpg) + if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) return -EINVAL; main_bm_inode = ocfs2_get_system_file_inode(osb, @@ -7293,6 +7326,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out_unlock; } + len = range->len >> osb->s_clustersize_bits; if (start + len > le32_to_cpu(main_bm->i_clusters)) len = le32_to_cpu(main_bm->i_clusters) - start; @@ -7307,6 +7341,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); last_bit = osb->bitmap_cpg; + trimmed = 0; for (group = first_group; group <= last_group;) { if (first_bit + len >= osb->bitmap_cpg) last_bit = osb->bitmap_cpg; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f37d3c0e205..4a231a166cf 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -80,6 +80,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, le32_to_cpu(fe->i_clusters))) { + err = -ENOMEM; mlog(ML_ERROR, "block offset is outside the allocated size: " "%llu\n", (unsigned long long)iblock); goto bail; @@ -92,6 +93,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, iblock; buffer_cache_bh = sb_getblk(osb->sb, blkno); if (!buffer_cache_bh) { + err = -ENOMEM; mlog(ML_ERROR, "couldn't getblock for symlink!\n"); goto bail; } @@ -569,7 +571,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, { struct inode *inode = file_inode(iocb->ki_filp); int level; - wait_queue_head_t *wq = ocfs2_ioend_wq(inode); /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); @@ -580,10 +581,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (ocfs2_iocb_is_unaligned_aio(iocb)) { ocfs2_iocb_clear_unaligned_aio(iocb); - if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && - waitqueue_active(wq)) { - wake_up_all(wq); - } + mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } ocfs2_iocb_clear_rw_locked(iocb); @@ -592,33 +590,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, ocfs2_rw_unlock(inode, level); } -/* - * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen - * from ext3. PageChecked() bits have been removed as OCFS2 does not - * do journalled data. - */ -static void ocfs2_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - - jbd2_journal_invalidatepage(journal, page, offset, length); -} - static int ocfs2_releasepage(struct page *page, gfp_t wait) { - journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - if (!page_has_buffers(page)) return 0; - return jbd2_journal_try_to_free_buffers(journal, page, wait); + return try_to_free_buffers(page); } static ssize_t ocfs2_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file)->i_mapping->host; @@ -635,7 +617,7 @@ static ssize_t ocfs2_direct_IO(int rw, return 0; return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, - iov, offset, nr_segs, + iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); } @@ -1802,8 +1784,7 @@ try_again: data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; credits = ocfs2_calc_extend_credits(inode->i_sb, - &di->id2.i_list, - clusters_to_alloc); + &di->id2.i_list); } @@ -1897,10 +1878,14 @@ out_commit: out: ocfs2_free_write_ctxt(wc); - if (data_ac) + if (data_ac) { ocfs2_free_alloc_context(data_ac); - if (meta_ac) + data_ac = NULL; + } + if (meta_ac) { ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } if (ret == -ENOSPC && try_free) { /* @@ -2053,6 +2038,7 @@ out_write_size: inode->i_mtime = inode->i_ctime = CURRENT_TIME; di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); ocfs2_commit_trans(osb, handle); @@ -2087,7 +2073,7 @@ const struct address_space_operations ocfs2_aops = { .write_end = ocfs2_write_end, .bmap = ocfs2_bmap, .direct_IO = ocfs2_direct_IO, - .invalidatepage = ocfs2_invalidatepage, + .invalidatepage = block_invalidatepage, .releasepage = ocfs2_releasepage, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index f671e49beb3..6cae155d54d 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -102,9 +102,4 @@ enum ocfs2_iocb_lock_bits { #define ocfs2_iocb_is_unaligned_aio(iocb) \ test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) -#define OCFS2_IOEND_WQ_HASH_SZ 37 -#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\ - OCFS2_IOEND_WQ_HASH_SZ]) -extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; - #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 5d18ad10c27..1edcb141f63 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -90,7 +90,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, * information for this bh as it's not marked locally * uptodate. */ ret = -EIO; - put_bh(bh); mlog_errno(ret); } @@ -115,7 +114,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, if (bhs[i] == NULL) { bhs[i] = sb_getblk(osb->sb, block++); if (bhs[i] == NULL) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto bail; } @@ -214,7 +213,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, bhs[i] = sb_getblk(sb, block++); if (bhs[i] == NULL) { ocfs2_metadata_cache_io_unlock(ci); - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto bail; } @@ -420,7 +419,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, if (!buffer_uptodate(bh)) { ret = -EIO; - put_bh(bh); mlog_errno(ret); } diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile index bc8c5e7d860..1aefc0350ec 100644 --- a/fs/ocfs2/cluster/Makefile +++ b/fs/ocfs2/cluster/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ - quorum.o tcp.o netdebug.o ver.o + quorum.o tcp.o netdebug.o diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 363f0dcc924..73039295d0d 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -35,6 +35,7 @@ #include <linux/time.h> #include <linux/debugfs.h> #include <linux/slab.h> +#include <linux/bitmap.h> #include "heartbeat.h" #include "tcp.h" @@ -282,15 +283,6 @@ struct o2hb_bio_wait_ctxt { int wc_error; }; -static int o2hb_pop_count(void *map, int count) -{ - int i = -1, pop = 0; - - while ((i = find_next_bit(map, count, i + 1)) < count) - pop++; - return pop; -} - static void o2hb_write_timeout(struct work_struct *work) { int failed, quorum; @@ -307,9 +299,9 @@ static void o2hb_write_timeout(struct work_struct *work) spin_lock_irqsave(&o2hb_live_lock, flags); if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); - failed = o2hb_pop_count(&o2hb_failed_region_bitmap, + failed = bitmap_weight(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); - quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, + quorum = bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); spin_unlock_irqrestore(&o2hb_live_lock, flags); @@ -421,7 +413,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, } /* Must put everything in 512 byte sectors for the bio... */ - bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); + bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); bio->bi_bdev = reg->hr_bdev; bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; @@ -765,7 +757,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) * If global heartbeat active, unpin all regions if the * region count > CUT_OFF */ - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + if (bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) o2hb_region_unpin(NULL); unlock: @@ -954,23 +946,9 @@ out: return changed; } -/* This could be faster if we just implmented a find_last_bit, but I - * don't think the circumstances warrant it. */ -static int o2hb_highest_node(unsigned long *nodes, - int numbits) +static int o2hb_highest_node(unsigned long *nodes, int numbits) { - int highest, node; - - highest = numbits; - node = -1; - while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) { - if (node >= numbits) - break; - - highest = node; - } - - return highest; + return find_last_bit(nodes, numbits); } static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) @@ -1129,7 +1107,7 @@ static int o2hb_thread(void *data) mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); /* Pin node */ o2nm_depend_this_node(); @@ -1829,7 +1807,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, live_threshold = O2HB_LIVE_THRESHOLD; if (o2hb_global_heartbeat_active()) { spin_lock(&o2hb_live_lock); - if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) + if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) live_threshold <<= 1; spin_unlock(&o2hb_live_lock); } @@ -2180,7 +2158,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, if (!o2hb_dependent_users) goto unlock; - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + if (bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) o2hb_region_pin(NULL); @@ -2480,7 +2458,7 @@ static int o2hb_region_inc_user(const char *region_uuid) if (o2hb_dependent_users > 1) goto unlock; - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + if (bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) ret = o2hb_region_pin(NULL); diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index baa2b9ef7ee..2260fb9e650 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -199,7 +199,8 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; #define mlog_errno(st) do { \ int _st = (st); \ if (_st != -ERESTARTSYS && _st != -EINTR && \ - _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \ + _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \ + _st != -EDQUOT) \ mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ } while (0) diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index bb240647ca5..441c84e169e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -29,7 +29,6 @@ #include "heartbeat.h" #include "masklog.h" #include "sys.h" -#include "ver.h" /* for now we operate under the assertion that there can be only one * cluster active at a time. Changing this will require trickling @@ -945,8 +944,6 @@ static int __init init_o2nm(void) { int ret = -1; - cluster_print_version(); - ret = o2hb_init(); if (ret) goto out; @@ -984,6 +981,7 @@ out: MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster management"); module_init(init_o2nm) module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index a4b07730b2e..b7f57271d49 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); } static struct kobj_attribute attr_version = - __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL); + __ATTR(interface_revision, S_IRUGO, version_show, NULL); static struct attribute *o2cb_attrs[] = { &attr_version.attr, diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2cd2406b414..681691bc233 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -108,7 +108,7 @@ static struct rb_root o2net_handler_tree = RB_ROOT; static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; /* XXX someday we'll need better accounting */ -static struct socket *o2net_listen_sock = NULL; +static struct socket *o2net_listen_sock; /* * listen work is only queued by the listening socket callbacks on the @@ -137,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] = static void o2net_sc_connect_completed(struct work_struct *work); static void o2net_rx_until_empty(struct work_struct *work); static void o2net_shutdown_sc(struct work_struct *work); -static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_listen_data_ready(struct sock *sk); static void o2net_sc_send_keep_req(struct work_struct *work); static void o2net_idle_timer(unsigned long data); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); @@ -262,17 +262,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc) #endif /* CONFIG_OCFS2_FS_STATS */ -static inline int o2net_reconnect_delay(void) +static inline unsigned int o2net_reconnect_delay(void) { return o2nm_single_cluster->cl_reconnect_delay_ms; } -static inline int o2net_keepalive_delay(void) +static inline unsigned int o2net_keepalive_delay(void) { return o2nm_single_cluster->cl_keepalive_delay_ms; } -static inline int o2net_idle_timeout(void) +static inline unsigned int o2net_idle_timeout(void) { return o2nm_single_cluster->cl_idle_timeout_ms; } @@ -597,9 +597,9 @@ static void o2net_set_nn_state(struct o2net_node *nn, } /* see o2net_register_callbacks() */ -static void o2net_data_ready(struct sock *sk, int bytes) +static void o2net_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); read_lock(&sk->sk_callback_lock); if (sk->sk_user_data) { @@ -613,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes) } read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } /* see o2net_register_callbacks() */ @@ -916,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { - int ret; - mm_segment_t oldfs; - struct kvec vec = { - .iov_len = len, - .iov_base = data, - }; - struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&vec, - .msg_flags = MSG_DONTWAIT, - }; - - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); - set_fs(oldfs); - - return ret; + struct kvec vec = { .iov_len = len, .iov_base = data, }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; + return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags); } static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, size_t veclen, size_t total) { int ret; - mm_segment_t oldfs; - struct msghdr msg = { - .msg_iov = (struct iovec *)vec, - .msg_iovlen = veclen, - }; + struct msghdr msg; if (sock == NULL) { ret = -EINVAL; goto out; } - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_sendmsg(sock, &msg, total); - set_fs(oldfs); - if (ret != total) { - mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, - total); - if (ret >= 0) - ret = -EPIPE; /* should be smarter, I bet */ - goto out; - } - - ret = 0; + ret = kernel_sendmsg(sock, &msg, vec, veclen, total); + if (likely(ret == total)) + return 0; + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ out: - if (ret < 0) - mlog(0, "returning error: %d\n", ret); + mlog(0, "returning error: %d\n", ret); return ret; } @@ -1826,7 +1799,7 @@ int o2net_register_hb_callbacks(void) /* ------------------------------------------------------------ */ -static int o2net_accept_one(struct socket *sock) +static int o2net_accept_one(struct socket *sock, int *more) { int ret, slen; struct sockaddr_in sin; @@ -1837,6 +1810,7 @@ static int o2net_accept_one(struct socket *sock) struct o2net_node *nn; BUG_ON(sock == NULL); + *more = 0; ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, &new_sock); if (ret) @@ -1848,6 +1822,7 @@ static int o2net_accept_one(struct socket *sock) if (ret < 0) goto out; + *more = 1; new_sock->sk->sk_allocation = GFP_ATOMIC; ret = o2net_set_nodelay(new_sock); @@ -1946,16 +1921,41 @@ out: return ret; } +/* + * This function is invoked in response to one or more + * pending accepts at softIRQ level. We must drain the + * entire que before returning. + */ + static void o2net_accept_many(struct work_struct *work) { struct socket *sock = o2net_listen_sock; - while (o2net_accept_one(sock) == 0) + int more; + int err; + + /* + * It is critical to note that due to interrupt moderation + * at the network driver level, we can't assume to get a + * softIRQ for every single conn since tcp SYN packets + * can arrive back-to-back, and therefore many pending + * accepts may result in just 1 softIRQ. If we terminate + * the o2net_accept_one() loop upon seeing an err, what happens + * to the rest of the conns in the queue? If no new SYN + * arrives for hours, no softIRQ will be delivered, + * and the connections will just sit in the queue. + */ + + for (;;) { + err = o2net_accept_one(sock, &more); + if (!more) + break; cond_resched(); + } } -static void o2net_listen_data_ready(struct sock *sk, int bytes) +static void o2net_listen_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); read_lock(&sk->sk_callback_lock); ready = sk->sk_user_data; @@ -1964,18 +1964,29 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes) goto out; } - /* ->sk_data_ready is also called for a newly established child socket - * before it has been accepted and the acceptor has set up their - * data_ready.. we only want to queue listen work for our listening - * socket */ + /* This callback may called twice when a new connection + * is being established as a child socket inherits everything + * from a parent LISTEN socket, including the data_ready cb of + * the parent. This leads to a hazard. In o2net_accept_one() + * we are still initializing the child socket but have not + * changed the inherited data_ready callback yet when + * data starts arriving. + * We avoid this hazard by checking the state. + * For the listening socket, the state will be TCP_LISTEN; for the new + * socket, will be TCP_ESTABLISHED. Also, in this case, + * sk->sk_user_data is not a valid function pointer. + */ + if (sk->sk_state == TCP_LISTEN) { - mlog(ML_TCP, "bytes: %d\n", bytes); queue_work(o2net_wq, &o2net_listen_work); + } else { + ready = NULL; } out: read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + if (ready != NULL) + ready(sk); } static int o2net_open_listening_sock(__be32 addr, __be16 port) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4cbcb65784a..dc024367110 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -165,7 +165,7 @@ struct o2net_sock_container { /* original handlers for the sockets */ void (*sc_state_change)(struct sock *sk); - void (*sc_data_ready)(struct sock *sk, int bytes); + void (*sc_data_ready)(struct sock *sk); u32 sc_msg_key; u16 sc_msg_type; diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c deleted file mode 100644 index a56eee6abad..00000000000 --- a/fs/ocfs2/cluster/ver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define CLUSTER_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION - -void cluster_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h deleted file mode 100644 index 32554c3382c..00000000000 --- a/fs/ocfs2/cluster/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef O2CLUSTER_VER_H -#define O2CLUSTER_VER_H - -void cluster_print_version(void); - -#endif /* O2CLUSTER_VER_H */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 0d3a97d2d5f..e2e05a106be 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -37,7 +37,6 @@ #include "dlmglue.h" #include "file.h" #include "inode.h" -#include "super.h" #include "ocfs2_trace.h" void ocfs2_dentry_attach_gen(struct dentry *dentry) @@ -346,52 +345,6 @@ out_attach: return ret; } -DEFINE_SPINLOCK(dentry_list_lock); - -/* We limit the number of dentry locks to drop in one go. We have - * this limit so that we don't starve other users of ocfs2_wq. */ -#define DL_INODE_DROP_COUNT 64 - -/* Drop inode references from dentry locks */ -static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) -{ - struct ocfs2_dentry_lock *dl; - - spin_lock(&dentry_list_lock); - while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { - dl = osb->dentry_lock_list; - osb->dentry_lock_list = dl->dl_next; - spin_unlock(&dentry_list_lock); - iput(dl->dl_inode); - kfree(dl); - spin_lock(&dentry_list_lock); - } - spin_unlock(&dentry_list_lock); -} - -void ocfs2_drop_dl_inodes(struct work_struct *work) -{ - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); - - __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); - /* - * Don't queue dropping if umount is in progress. We flush the - * list in ocfs2_dismount_volume - */ - spin_lock(&dentry_list_lock); - if (osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - spin_unlock(&dentry_list_lock); -} - -/* Flush the whole work queue */ -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) -{ - __ocfs2_drop_dl_inodes(osb, -1); -} - /* * ocfs2_dentry_iput() and friends. * @@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { + iput(dl->dl_inode); ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); ocfs2_lock_res_free(&dl->dl_lockres); - - /* We leave dropping of inode reference to ocfs2_wq as that can - * possibly lead to inode deletion which gets tricky */ - spin_lock(&dentry_list_lock); - if (!osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - dl->dl_next = osb->dentry_lock_list; - osb->dentry_lock_list = dl; - spin_unlock(&dentry_list_lock); + kfree(dl); } void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { - int unlock; + int unlock = 0; BUG_ON(dl->dl_count == 0); diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index b79eff70995..55f58892b15 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -29,13 +29,8 @@ extern const struct dentry_operations ocfs2_dentry_ops; struct ocfs2_dentry_lock { - /* Use count of dentry lock */ unsigned int dl_count; - union { - /* Linked list of dentry locks to release */ - struct ocfs2_dentry_lock *dl_next; - u64 dl_parent_blkno; - }; + u64 dl_parent_blkno; /* * The ocfs2_dentry_lock keeps an inode reference until @@ -49,14 +44,9 @@ struct ocfs2_dentry_lock { int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, u64 parent_blkno); -extern spinlock_t dentry_list_lock; - void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); -void ocfs2_drop_dl_inodes(struct work_struct *work); -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); - struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 30544ce8e9f..0717662b4ae 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2349,7 +2349,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, dx_root_bh = sb_getblk(osb->sb, dr_blkno); if (dx_root_bh == NULL) { - ret = -EIO; + ret = -ENOMEM; goto out; } ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh); @@ -2422,7 +2422,7 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb, for (i = 0; i < num_dx_leaves; i++) { bh = sb_getblk(osb->sb, start_blk + i); if (bh == NULL) { - ret = -EIO; + ret = -ENOMEM; goto out; } dx_leaves[i] = bh; @@ -2929,7 +2929,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); dirdata_bh = sb_getblk(sb, blkno); if (!dirdata_bh) { - ret = -EIO; + ret = -ENOMEM; mlog_errno(ret); goto out_commit; } @@ -2957,6 +2957,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_init_dir_trailer(dir, dirdata_bh, i); } + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, dirdata_bh); if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { @@ -3005,6 +3006,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, di->i_size = cpu_to_le64(sb->s_blocksize); di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, dir, 1); /* * This should never fail as our extent list is empty and all @@ -3159,7 +3161,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb, *new_bh = sb_getblk(sb, p_blkno); if (!*new_bh) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto bail; } @@ -3284,7 +3286,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, if (ocfs2_dir_resv_allowed(osb)) data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; - credits = ocfs2_calc_extend_credits(sb, el, 1); + credits = ocfs2_calc_extend_credits(sb, el); } else { spin_unlock(&OCFS2_I(dir)->ip_lock); credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; @@ -3338,6 +3340,7 @@ do_extend: } else { de->rec_len = cpu_to_le16(sb->s_blocksize); } + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, new_bh); dir_i_size += dir->i_sb->s_blocksize; @@ -3716,7 +3719,7 @@ static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb, { int credits = ocfs2_clusters_to_blocks(osb->sb, 2); - credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1); + credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list); credits += ocfs2_quota_trans_credits(osb->sb); return credits; } @@ -3896,6 +3899,7 @@ out_commit: dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_commit_trans(osb, handle); out: @@ -4134,6 +4138,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, mlog_errno(ret); did_quota = 0; + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, dx_root_bh); out_commit: @@ -4401,6 +4406,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); spin_unlock(&OCFS2_I(dir)->ip_lock); di->i_dx_root = cpu_to_le64(0ULL); + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index c8a044efbb1..bd1aab1f49a 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2 obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ - dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e0517762fcc..fae17c640df 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -108,7 +108,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) struct dlm_recovery_ctxt { struct list_head resources; - struct list_head received; struct list_head node_data; u8 new_master; u8 dead_node; @@ -332,6 +331,7 @@ struct dlm_lock_resource u16 state; char lvb[DLM_LVB_LEN]; unsigned int inflight_locks; + unsigned int inflight_assert_workers; unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; }; @@ -911,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index e33cd7a3c58..18f13c2e4a1 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -338,7 +338,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) #ifdef CONFIG_DEBUG_FS -static struct dentry *dlm_debugfs_root = NULL; +static struct dentry *dlm_debugfs_root; #define DLM_DEBUGFS_DIR "o2dlm" #define DLM_DEBUGFS_DLM_STATE "dlm_state" diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8b3382abf84..39efc5057a3 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -43,8 +43,6 @@ #include "dlmdomain.h" #include "dlmdebug.h" -#include "dlmver.h" - #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" @@ -961,6 +959,14 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, * domain. Set him in the map and clean up our * leftover join state. */ BUG_ON(dlm->joining_node != assert->node_idx); + + if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(0, "dlm recovery is ongoing, disallow join\n"); + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + return -EAGAIN; + } + set_bit(assert->node_idx, dlm->domain_map); clear_bit(assert->node_idx, dlm->exit_domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); @@ -1125,7 +1131,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, struct dlm_ctxt *dlm = NULL; char *local = NULL; int status = 0; - int locked = 0; qr = (struct dlm_query_region *) msg->buf; @@ -1134,10 +1139,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, /* buffer used in dlm_mast_regions() */ local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); - if (!local) { - status = -ENOMEM; - goto bail; - } + if (!local) + return -ENOMEM; status = -EINVAL; @@ -1146,16 +1149,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, if (!dlm) { mlog(ML_ERROR, "Node %d queried hb regions on domain %s " "before join domain\n", qr->qr_node, qr->qr_domain); - goto bail; + goto out_domain_lock; } spin_lock(&dlm->spinlock); - locked = 1; if (dlm->joining_node != qr->qr_node) { mlog(ML_ERROR, "Node %d queried hb regions on domain %s " "but joining node is %d\n", qr->qr_node, qr->qr_domain, dlm->joining_node); - goto bail; + goto out_dlm_lock; } /* Support for global heartbeat was added in 1.1 */ @@ -1165,14 +1167,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, "but active dlm protocol is %d.%d\n", qr->qr_node, qr->qr_domain, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); - goto bail; + goto out_dlm_lock; } status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); -bail: - if (locked) - spin_unlock(&dlm->spinlock); +out_dlm_lock: + spin_unlock(&dlm->spinlock); + +out_domain_lock: spin_unlock(&dlm_domain_lock); kfree(local); @@ -1522,6 +1525,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, unsigned int node) { int status; + int ret; struct dlm_assert_joined assert_msg; mlog(0, "Sending join assert to node %u\n", node); @@ -1533,11 +1537,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, &assert_msg, sizeof(assert_msg), node, - NULL); + &ret); if (status < 0) mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, node); + else + status = ret; return status; } @@ -1879,19 +1885,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - status = dlm_debug_init(dlm); + status = dlm_launch_thread(dlm); if (status < 0) { mlog_errno(status); goto bail; } - status = dlm_launch_thread(dlm); + status = dlm_launch_recovery_thread(dlm); if (status < 0) { mlog_errno(status); goto bail; } - status = dlm_launch_recovery_thread(dlm); + status = dlm_debug_init(dlm); if (status < 0) { mlog_errno(status); goto bail; @@ -2028,7 +2034,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, INIT_LIST_HEAD(&dlm->list); INIT_LIST_HEAD(&dlm->dirty_list); INIT_LIST_HEAD(&dlm->reco.resources); - INIT_LIST_HEAD(&dlm->reco.received); INIT_LIST_HEAD(&dlm->reco.node_data); INIT_LIST_HEAD(&dlm->purge_list); INIT_LIST_HEAD(&dlm->dlm_domain_handlers); @@ -2328,8 +2333,6 @@ static int __init dlm_init(void) { int status; - dlm_print_version(); - status = dlm_init_mle_cache(); if (status) { mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); @@ -2379,6 +2382,7 @@ static void __exit dlm_exit (void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 Distributed Lock Management"); module_init(dlm_init); module_exit(dlm_exit); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 5d32f7511f7..66c2a491f68 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -52,7 +52,7 @@ #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" -static struct kmem_cache *dlm_lock_cache = NULL; +static struct kmem_cache *dlm_lock_cache; static DEFINE_SPINLOCK(dlm_cookie_lock); static u64 dlm_next_cookie = 1; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index cf0f103963b..82abf0cc9a1 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -82,9 +82,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, return 1; } -static struct kmem_cache *dlm_lockres_cache = NULL; -static struct kmem_cache *dlm_lockname_cache = NULL; -static struct kmem_cache *dlm_mle_cache = NULL; +static struct kmem_cache *dlm_lockres_cache; +static struct kmem_cache *dlm_lockname_cache; +static struct kmem_cache *dlm_mle_cache; static void dlm_mle_release(struct kref *kref); static void dlm_init_mle(struct dlm_master_list_entry *mle, @@ -472,11 +472,15 @@ bail: void dlm_destroy_master_caches(void) { - if (dlm_lockname_cache) + if (dlm_lockname_cache) { kmem_cache_destroy(dlm_lockname_cache); + dlm_lockname_cache = NULL; + } - if (dlm_lockres_cache) + if (dlm_lockres_cache) { kmem_cache_destroy(dlm_lockres_cache); + dlm_lockres_cache = NULL; + } } static void dlm_lockres_release(struct kref *kref) @@ -577,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; res->inflight_locks = 0; + res->inflight_assert_workers = 0; res->dlm = dlm; @@ -679,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, wake_up(&res->wq); } +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + res->inflight_assert_workers++; + mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + +static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + BUG_ON(res->inflight_assert_workers == 0); + res->inflight_assert_workers--; + mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_drop_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + /* * lookup a lock resource by name. * may already exist in the hashtable. @@ -1599,7 +1641,8 @@ send_response: mlog(ML_ERROR, "failed to dispatch assert master work\n"); response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); - } + } else + dlm_lockres_grab_inflight_worker(dlm, res); } else { if (res) dlm_lockres_put(res); @@ -1885,8 +1928,10 @@ ok: * up nodes that this node contacted */ while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, nn+1)) < O2NM_MAX_NODES) { - if (nn != dlm->node_num && nn != assert->node_idx) + if (nn != dlm->node_num && nn != assert->node_idx) { master_request = 1; + break; + } } } mle->master = assert->node_idx; @@ -2112,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) dlm_lockres_release_ast(dlm, res); put: + dlm_lockres_drop_inflight_worker(dlm, res); + dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -2354,6 +2401,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); + /* delay migration when the lockres is in MIGRATING state */ + if (res->state & DLM_LOCK_RES_MIGRATING) + return 0; + if (res->owner != dlm->node_num) return 0; @@ -3078,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it so that only one mle will be found */ __dlm_unlink_mle(dlm, tmp); __dlm_mle_detach_hb_events(dlm, tmp); - ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; - mlog(0, "%s:%.*s: master=%u, newmaster=%u, " - "telling master to get ref for cleared out mle " - "during migration\n", dlm->name, namelen, name, - master, new_master); + if (tmp->type == DLM_MLE_MASTER) { + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref " + "for cleared out mle during " + "migration\n", dlm->name, + namelen, name, master, + new_master); + } } spin_unlock(&tmp->spinlock); } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 0b5adca1b17..45067faf569 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -537,7 +537,10 @@ master_here: /* success! see if any other nodes need recovery */ mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", dlm->name, dlm->reco.dead_node, dlm->node_num); - dlm_reset_recovery(dlm); + spin_lock(&dlm->spinlock); + __dlm_reset_recovery(dlm); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); } dlm_end_recovery(dlm); @@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) if (all_nodes_done) { int ret; + /* Set this flag on recovery master to avoid + * a new recovery for another dead node start + * before the recovery is not done. That may + * cause recovery hung.*/ + spin_lock(&dlm->spinlock); + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + /* all nodes are now in DLM_RECO_NODE_DATA_DONE state * just send a finalize message to everyone and * clean up */ @@ -1697,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, mlog_errno(-ENOMEM); /* retry!? */ BUG(); - } + } else + __dlm_lockres_grab_inflight_worker(dlm, res); } else /* put.. incase we are not the master */ dlm_lockres_put(res); spin_unlock(&res->spinlock); @@ -1750,13 +1762,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_migratable_lockres *mres) { struct dlm_migratable_lock *ml; - struct list_head *queue; + struct list_head *queue, *iter; struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; int i, j, bad; - struct dlm_lock *lock = NULL; + struct dlm_lock *lock; u8 from = O2NM_MAX_NODES; unsigned int added = 0; __be64 c; @@ -1791,14 +1803,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* MIGRATION ONLY! */ BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); + lock = NULL; spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each_entry(lock, tmpq, list) { - if (lock->ml.cookie != ml->cookie) - lock = NULL; - else + list_for_each(iter, tmpq) { + lock = list_entry(iter, + struct dlm_lock, list); + if (lock->ml.cookie == ml->cookie) break; + lock = NULL; } if (lock) break; @@ -1886,6 +1900,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, if (ml->type == LKM_NLMODE) goto skip_lvb; + /* + * If the lock is in the blocked list it can't have a valid lvb, + * so skip it + */ + if (ml->list == DLM_BLOCKED_LIST) + goto skip_lvb; + if (!dlm_lvb_is_empty(mres->lvb)) { if (lksb->flags & DLM_LKSB_PUT_LVB) { /* other node was trying to update @@ -1966,7 +1987,15 @@ skip_lvb: } if (!bad) { dlm_lock_get(newlock); - list_add_tail(&newlock->list, queue); + if (mres->flags & DLM_MRES_RECOVERY && + ml->list == DLM_CONVERTING_LIST && + newlock->ml.type > + newlock->ml.convert_type) { + /* newlock is doing downconvert, add it to the + * head of converting list */ + list_add(&newlock->list, queue); + } else + list_add_tail(&newlock->list, queue); mlog(0, "%s:%.*s: added lock for node %u, " "setting refmap bit\n", dlm->name, res->lockname.len, res->lockname.name, ml->node); @@ -2875,8 +2904,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, BUG(); } dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); spin_unlock(&dlm->spinlock); - dlm_reset_recovery(dlm); dlm_kick_recovery_thread(dlm); break; default: diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 9db869de829..69aac6f088a 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -259,12 +259,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, * refs on it. */ unused = __dlm_lockres_unused(lockres); if (!unused || - (lockres->state & DLM_LOCK_RES_MIGRATING)) { + (lockres->state & DLM_LOCK_RES_MIGRATING) || + (lockres->inflight_assert_workers != 0)) { mlog(0, "%s: res %.*s is in use or being remastered, " - "used %d, state %d\n", dlm->name, - lockres->lockname.len, lockres->lockname.name, - !unused, lockres->state); - list_move_tail(&dlm->purge_list, &lockres->purge); + "used %d, state %d, assert master workers %u\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, + !unused, lockres->state, + lockres->inflight_assert_workers); + list_move_tail(&lockres->purge, &dlm->purge_list); spin_unlock(&lockres->spinlock); continue; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 5698b52cf5c..2e3c9dbab68 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, DLM_UNLOCK_CLEAR_CONVERT_TYPE); } else if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR + ) { /* must clear the actions because this unlock * is about to be retried. cannot free or do * any list manipulation. */ @@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, res->lockname.name, status==DLM_RECOVERING?"recovering": (status==DLM_MIGRATING?"migrating": - "forward")); + (status == DLM_FORWARD ? "forward" : + "nolockmanager"))); actions = 0; } if (flags & LKM_CANCEL) @@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, * updated state to the recovery master. this thread * just needs to finish out the operation and call * the unlockast. */ - ret = DLM_NORMAL; + if (dlm_is_node_dead(dlm, owner)) + ret = DLM_NORMAL; + else + ret = DLM_NOLOCKMGR; } else { /* something bad. this will BUG in ocfs2 */ ret = dlm_err_to_dlm_status(tmpret); @@ -638,7 +644,9 @@ retry: if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR) { + /* We want to go away for a tiny bit to allow recovery * / migration to complete on this resource. I don't * know of any wait queue we could sleep on as this @@ -650,7 +658,7 @@ retry: msleep(50); mlog(0, "retrying unlock due to pending recovery/" - "migration/in-progress\n"); + "migration/in-progress/reconnect\n"); goto retry; } diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c deleted file mode 100644 index dfc0da4d158..00000000000 --- a/fs/ocfs2/dlm/dlmver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION - -void dlm_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h deleted file mode 100644 index f674aee77a1..00000000000 --- a/fs/ocfs2/dlm/dlmver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLM_VER_H -#define DLM_VER_H - -void dlm_print_version(void); - -#endif /* DLM_VER_H */ diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index f14be89a670..eed3db8c5b4 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2 obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o -ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o +ocfs2_dlmfs-objs := userdlm.o dlmfs.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index efa2b3d339e..09b7d9dac71 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -49,7 +49,6 @@ #include "stackglue.h" #include "userdlm.h" -#include "dlmfsver.h" #define MLOG_MASK_PREFIX ML_DLMFS #include "cluster/masklog.h" @@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void) int status; int cleanup_inode = 0, cleanup_worker = 0; - dlmfs_print_version(); - status = bdi_init(&dlmfs_backing_dev_info); if (status) return status; @@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 DLM-Filesystem"); module_init(init_dlmfs_fs) module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c deleted file mode 100644 index a733b3321f8..00000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmfsver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION - -void dlmfs_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h deleted file mode 100644 index f35eadbed25..00000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLMFS_VER_H -#define DLMFS_VER_H - -void dlmfs_print_version(void); - -#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3a44a648dae..52cfe99ae05 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1304,7 +1304,7 @@ static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) { wait_for_completion(&mw->mw_complete); /* Re-arm the completion in case we want to wait on it again */ - INIT_COMPLETION(mw->mw_complete); + reinit_completion(&mw->mw_complete); return mw->mw_status; } @@ -1355,7 +1355,7 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw, else ret = mw->mw_status; /* Re-arm the completion in case we want to wait on it again */ - INIT_COMPLETION(mw->mw_complete); + reinit_completion(&mw->mw_complete); return ret; } @@ -2544,11 +2544,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb, * refreshed, so we do it here. Of course, making sense of * everything is up to the caller :) */ status = ocfs2_should_refresh_lock_res(lockres); - if (status < 0) { - ocfs2_cluster_unlock(osb, lockres, level); - mlog_errno(status); - goto bail; - } if (status) { status = ocfs2_refresh_slot_info(osb); @@ -2996,6 +2991,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) /* for now, uuid == domain */ status = ocfs2_cluster_connect(osb->osb_cluster_stack, + osb->osb_cluster_name, + strlen(osb->osb_cluster_name), osb->uuid_str, strlen(osb->uuid_str), &lproto, ocfs2_do_node_down, osb, @@ -3005,7 +3002,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto bail; } - status = ocfs2_cluster_this_node(&osb->node_num); + status = ocfs2_cluster_this_node(conn, &osb->node_num); if (status < 0) { mlog_errno(status); mlog(ML_ERROR, @@ -3142,22 +3139,60 @@ out: return 0; } +static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + /* Mark the lockres as being dropped. It will no longer be * queued if blocking, but we still may have to wait on it * being dequeued from the downconvert thread before we can consider * it safe to drop. * * You can *not* attempt to call cluster_lock on this lockres anymore. */ -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) { int status; struct ocfs2_mask_waiter mw; - unsigned long flags; + unsigned long flags, flags2; ocfs2_init_mask_waiter(&mw); spin_lock_irqsave(&lockres->l_lock, flags); lockres->l_flags |= OCFS2_LOCK_FREEING; + if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) { + /* + * We know the downconvert is queued but not in progress + * because we are the downconvert thread and processing + * different lock. So we can just remove the lock from the + * queue. This is not only an optimization but also a way + * to avoid the following deadlock: + * ocfs2_dentry_post_unlock() + * ocfs2_dentry_lock_put() + * ocfs2_drop_dentry_lock() + * iput() + * ocfs2_evict_inode() + * ocfs2_clear_inode() + * ocfs2_mark_lockres_freeing() + * ... blocks waiting for OCFS2_LOCK_QUEUED + * since we are the downconvert thread which + * should clear the flag. + */ + spin_unlock_irqrestore(&lockres->l_lock, flags); + spin_lock_irqsave(&osb->dc_task_lock, flags2); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock_irqrestore(&osb->dc_task_lock, flags2); + /* + * Warn if we recurse into another post_unlock call. Strictly + * speaking it isn't a problem but we need to be careful if + * that happens (stack overflow, deadlocks, ...) so warn if + * ocfs2 grows a path for which this can happen. + */ + WARN_ON_ONCE(lockres->l_ops->post_unlock); + /* Since the lock is freeing we don't do much in the fn below */ + ocfs2_process_blocked_lock(osb, lockres); + return; + } while (lockres->l_flags & OCFS2_LOCK_QUEUED) { lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -3178,7 +3213,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, { int ret; - ocfs2_mark_lockres_freeing(lockres); + ocfs2_mark_lockres_freeing(osb, lockres); ret = ocfs2_drop_lock(osb, lockres); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 1d596d8c4a4..d293a22c32c 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d71903c6068..2930e231f3f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -175,9 +175,13 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { int err = 0; - journal_t *journal; struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + journal_t *journal = osb->journal->j_journal; + int ret; + tid_t commit_tid; + bool needs_barrier = false; trace_ocfs2_sync_file(inode, file, file->f_path.dentry, OCFS2_I(inode)->ip_blkno, @@ -185,33 +189,26 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, file->f_path.dentry->d_name.name, (unsigned long long)datasync); + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) return err; - /* - * Probably don't need the i_mutex at all in here, just putting it here - * to be consistent with how fsync used to be called, someone more - * familiar with the fs could possibly remove it. - */ - mutex_lock(&inode->i_mutex); - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { - /* - * We still have to flush drive's caches to get data to the - * platter - */ - if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - goto bail; + commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid; + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + err = jbd2_complete_transaction(journal, commit_tid); + if (needs_barrier) { + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!err) + err = ret; } - journal = osb->journal->j_journal; - err = jbd2_journal_force_commit(journal); - -bail: if (err) mlog_errno(err); - mutex_unlock(&inode->i_mutex); return (err < 0) ? -EIO : 0; } @@ -289,6 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode, inode->i_atime = CURRENT_TIME; di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); out_commit: @@ -338,6 +336,7 @@ int ocfs2_simple_size_update(struct inode *inode, if (ret < 0) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_commit_trans(osb, handle); out: return ret; @@ -432,6 +431,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, di->i_size = cpu_to_le64(new_i_size); di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); @@ -474,11 +474,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* lets handle the simple truncate cases before doing any more - * cluster locking. */ - if (new_i_size == le64_to_cpu(fe->i_size)) - goto bail; - down_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_resv_discard(&osb->osb_la_resmap, @@ -580,7 +575,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, int did_quota = 0; /* - * This function only exists for file systems which don't + * Unwritten extent only exists for file systems which * support holes. */ BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); @@ -603,8 +598,7 @@ restart_all: goto leave; } - credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, - clusters_to_add); + credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -653,7 +647,7 @@ restarted_transaction: mlog_errno(status); goto leave; } - + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, bh); spin_lock(&OCFS2_I(inode)->ip_lock); @@ -719,7 +713,8 @@ leave: * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. */ -static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, + struct buffer_head *di_bh) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; @@ -736,8 +731,16 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) } ret = ocfs2_jbd2_file_inode(handle, inode); - if (ret < 0) + if (ret < 0) { mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) + mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 1); out: if (ret) { @@ -752,7 +755,7 @@ out: * to be too fragile to do exactly what we need without us having to * worry about recursive locking in ->write_begin() and ->write_end(). */ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, - u64 abs_to) + u64 abs_to, struct buffer_head *di_bh) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -760,6 +763,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, handle_t *handle = NULL; int ret = 0; unsigned zero_from, zero_to, block_start, block_end; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; BUG_ON(abs_from >= abs_to); BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); @@ -802,7 +806,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, } if (!handle) { - handle = ocfs2_zero_start_ordered_transaction(inode); + handle = ocfs2_zero_start_ordered_transaction(inode, + di_bh); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; @@ -819,8 +824,23 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, ret = 0; } - if (handle) + if (handle) { + /* + * fs-writeback will release the dirty pages without page lock + * whose offset are over inode size, the release happens at + * block_write_full_page(). + */ + i_size_write(inode, abs_to); + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + di->i_mtime_nsec = di->i_ctime_nsec; + ocfs2_journal_dirty(handle, di_bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + } out_unlock: unlock_page(page); @@ -916,7 +936,7 @@ out: * has made sure that the entire range needs zeroing. */ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, - u64 range_end) + u64 range_end, struct buffer_head *di_bh) { int rc = 0; u64 next_pos; @@ -932,7 +952,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; if (next_pos > range_end) next_pos = range_end; - rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh); if (rc < 0) { mlog_errno(rc); break; @@ -978,7 +998,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, range_end = zero_to_size; ret = ocfs2_zero_extend_range(inode, range_start, - range_end); + range_end, di_bh); if (ret) { mlog_errno(ret); break; @@ -1146,14 +1166,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) goto bail_unlock_rw; } - if (size_change && attr->ia_size != i_size_read(inode)) { + if (size_change) { status = inode_newsize_ok(inode, attr->ia_size); if (status) goto bail_unlock; inode_dio_wait(inode); - if (i_size_read(inode) > attr->ia_size) { + if (i_size_read(inode) >= attr->ia_size) { if (ocfs2_should_order_data(inode)) { status = ocfs2_begin_ordered_truncate(inode, attr->ia_size); @@ -1237,7 +1257,7 @@ bail: dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { - status = ocfs2_acl_chmod(inode); + status = posix_acl_chmod(inode, inode->i_mode); if (status < 0) mlog_errno(status); } @@ -1323,6 +1343,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode, di = (struct ocfs2_dinode *) bh->b_data; di->i_mode = cpu_to_le16(inode->i_mode); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); @@ -1555,6 +1576,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, if (ret) mlog_errno(ret); } + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_commit_trans(osb, handle); out: @@ -1870,7 +1892,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } size = sr->l_start + sr->l_len; - if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 || + cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) { if (sr->l_len <= 0) { ret = -EINVAL; goto out_inode_unlock; @@ -2039,13 +2062,6 @@ out: return ret; } -static void ocfs2_aiodio_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ocfs2_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); -} - static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) { int blockmask = inode->i_sb->s_blocksize - 1; @@ -2217,16 +2233,13 @@ out: return ret; } -static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) +static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, + struct iov_iter *from) { int ret, direct_io, appending, rw_level, have_alloc_sem = 0; int can_do_direct, has_refcount = 0; ssize_t written = 0; - size_t ocount; /* original count */ - size_t count; /* after file limit checks */ + size_t count = iov_iter_count(from); loff_t old_size, *ppos = &iocb->ki_pos; u32 old_clusters; struct file *file = iocb->ki_filp; @@ -2240,7 +2253,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, (unsigned long long)OCFS2_I(inode)->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, - (unsigned int)nr_segs); + (unsigned int)from->nr_segs); /* GRRRRR */ if (iocb->ki_nbytes == 0) return 0; @@ -2323,10 +2336,8 @@ relock: * Wait on previous unaligned aio to complete before * proceeding. */ - ocfs2_aiodio_wait(inode); - - /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ - atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); + mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); + /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */ ocfs2_iocb_set_unaligned_aio(iocb); } @@ -2340,28 +2351,23 @@ relock: /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); - ret = generic_segment_checks(iov, &nr_segs, &ocount, - VERIFY_READ); - if (ret) - goto out_dio; - - count = ocount; ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); if (ret) goto out_dio; + iov_iter_truncate(from, count); if (direct_io) { - written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, - ppos, count, ocount); + written = generic_file_direct_write(iocb, from, *ppos); if (written < 0) { ret = written; goto out_dio; } } else { current->backing_dev_info = file->f_mapping->backing_dev_info; - written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, - ppos, count, 0); + written = generic_perform_write(file, from, *ppos); + if (likely(written >= 0)) + iocb->ki_pos = *ppos + written; current->backing_dev_info = NULL; } @@ -2371,8 +2377,8 @@ out_dio: if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || ((file->f_flags & O_DIRECT) && !direct_io)) { - ret = filemap_fdatawrite_range(file->f_mapping, pos, - pos + count - 1); + ret = filemap_fdatawrite_range(file->f_mapping, *ppos, + *ppos + count - 1); if (ret < 0) written = ret; @@ -2385,8 +2391,8 @@ out_dio: } if (!ret) - ret = filemap_fdatawait_range(file->f_mapping, pos, - pos + count - 1); + ret = filemap_fdatawait_range(file->f_mapping, *ppos, + *ppos + count - 1); } /* @@ -2406,7 +2412,7 @@ out_dio: if (unaligned_dio) { ocfs2_iocb_clear_unaligned_aio(iocb); - atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); + mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } out: @@ -2424,84 +2430,6 @@ out_sems: return ret; } -static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, - struct file *out, - struct splice_desc *sd) -{ - int ret; - - ret = ocfs2_prepare_inode_for_write(out, &sd->pos, - sd->total_len, 0, NULL, NULL); - if (ret < 0) { - mlog_errno(ret); - return ret; - } - - return splice_from_pipe_feed(pipe, sd, pipe_to_file); -} - -static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, - struct file *out, - loff_t *ppos, - size_t len, - unsigned int flags) -{ - int ret; - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; - struct splice_desc sd = { - .total_len = len, - .flags = flags, - .pos = *ppos, - .u.file = out, - }; - - - trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - out->f_path.dentry->d_name.len, - out->f_path.dentry->d_name.name, len); - - pipe_lock(pipe); - - splice_from_pipe_begin(&sd); - do { - ret = splice_from_pipe_next(pipe, &sd); - if (ret <= 0) - break; - - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - ret = ocfs2_rw_lock(inode, 1); - if (ret < 0) - mlog_errno(ret); - else { - ret = ocfs2_splice_to_file(pipe, out, &sd); - ocfs2_rw_unlock(inode, 1); - } - mutex_unlock(&inode->i_mutex); - } while (ret > 0); - splice_from_pipe_end(pipe, &sd); - - pipe_unlock(pipe); - - if (sd.num_spliced) - ret = sd.num_spliced; - - if (ret > 0) { - int err; - - err = generic_write_sync(out, *ppos, ret); - if (err) - ret = err; - else - *ppos += ret; - - balance_dirty_pages_ratelimited(mapping); - } - - return ret; -} - static ssize_t ocfs2_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, @@ -2517,7 +2445,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in, in->f_path.dentry->d_name.name, len); /* - * See the comment in ocfs2_file_aio_read() + * See the comment in ocfs2_file_read_iter() */ ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level); if (ret < 0) { @@ -2532,10 +2460,8 @@ bail: return ret; } -static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) +static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, + struct iov_iter *to) { int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; struct file *filp = iocb->ki_filp; @@ -2544,7 +2470,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, filp->f_path.dentry->d_name.len, - filp->f_path.dentry->d_name.name, nr_segs); + filp->f_path.dentry->d_name.name, + to->nr_segs); /* GRRRRR */ if (!inode) { @@ -2589,13 +2516,13 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } ocfs2_inode_unlock(inode, lock_level); - ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); + ret = generic_file_read_iter(iocb, to); trace_generic_file_aio_read_ret(ret); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); - /* see ocfs2_file_aio_write */ + /* see ocfs2_file_write_iter */ if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { rw_level = -1; have_alloc_sem = 0; @@ -2623,7 +2550,16 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_SET: break; case SEEK_END: - offset += inode->i_size; + /* SEEK_END requires the OCFS2 inode lock for the file + * because it references the file's size. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + offset += i_size_read(inode); + ocfs2_inode_unlock(inode, 0); break; case SEEK_CUR: if (offset == 0) { @@ -2662,6 +2598,7 @@ const struct inode_operations ocfs2_file_iops = { .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; const struct inode_operations ocfs2_special_file_iops = { @@ -2669,6 +2606,7 @@ const struct inode_operations ocfs2_special_file_iops = { .getattr = ocfs2_getattr, .permission = ocfs2_permission, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; /* @@ -2677,14 +2615,14 @@ const struct inode_operations ocfs2_special_file_iops = { */ const struct file_operations ocfs2_fops = { .llseek = ocfs2_file_llseek, - .read = do_sync_read, - .write = do_sync_write, + .read = new_sync_read, + .write = new_sync_write, .mmap = ocfs2_mmap, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, .open = ocfs2_file_open, - .aio_read = ocfs2_file_aio_read, - .aio_write = ocfs2_file_aio_write, + .read_iter = ocfs2_file_read_iter, + .write_iter = ocfs2_file_write_iter, .unlocked_ioctl = ocfs2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, @@ -2692,7 +2630,7 @@ const struct file_operations ocfs2_fops = { .lock = ocfs2_lock, .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, - .splice_write = ocfs2_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, }; @@ -2725,21 +2663,21 @@ const struct file_operations ocfs2_dops = { */ const struct file_operations ocfs2_fops_no_plocks = { .llseek = ocfs2_file_llseek, - .read = do_sync_read, - .write = do_sync_write, + .read = new_sync_read, + .write = new_sync_write, .mmap = ocfs2_mmap, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, .open = ocfs2_file_open, - .aio_read = ocfs2_file_aio_read, - .aio_write = ocfs2_file_aio_write, + .read_iter = ocfs2_file_read_iter, + .write_iter = ocfs2_file_write_iter, .unlocked_ioctl = ocfs2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, - .splice_write = ocfs2_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, }; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index f87f9bd1edf..437de7f768c 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; + journal_t *journal = OCFS2_SB(sb)->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, goto bail; } + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + read_unlock(&journal->j_state_lock); + oi->i_sync_tid = tid; + oi->i_datasync_tid = tid; + } + bail: if (!IS_ERR(inode)) { trace_ocfs2_iget_end(inode, @@ -386,19 +413,9 @@ static int ocfs2_read_locked_inode(struct inode *inode, u32 generation = 0; status = -EINVAL; - if (inode == NULL || inode->i_sb == NULL) { - mlog(ML_ERROR, "bad inode\n"); - return status; - } sb = inode->i_sb; osb = OCFS2_SB(sb); - if (!args) { - mlog(ML_ERROR, "bad inode args\n"); - make_bad_inode(inode); - return status; - } - /* * To improve performance of cold-cache inode stats, we take * the cluster lock here if possible. @@ -814,11 +831,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail; } - /* If we're coming from downconvert_thread we can't go into our own - * voting [hello, deadlock city!], so unforuntately we just - * have to skip deleting this guy. That's OK though because - * the node who's doing the actual deleting should handle it - * anyway. */ + /* + * If we're coming from downconvert_thread we can't go into our own + * voting [hello, deadlock city!] so we cannot delete the inode. But + * since we dropped last inode ref when downconverting dentry lock, + * we cannot have the file open and thus the node doing unlink will + * take care of deleting the inode. + */ if (current == osb->dc_task) goto bail; @@ -832,12 +851,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail_unlock; } - /* If we have allowd wipe of this inode for another node, it - * will be marked here so we can safely skip it. Recovery will - * cleanup any inodes we might inadvertently skip here. */ - if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) - goto bail_unlock; - ret = 1; bail_unlock: spin_unlock(&oi->ip_lock); @@ -951,7 +964,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); if (sync_data) filemap_write_and_wait(inode->i_mapping); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } static void ocfs2_delete_inode(struct inode *inode) @@ -970,8 +983,6 @@ static void ocfs2_delete_inode(struct inode *inode) if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) goto bail; - dquot_initialize(inode); - if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages * here but we do it for safety anyway (it will most @@ -980,6 +991,8 @@ static void ocfs2_delete_inode(struct inode *inode) goto bail; } + dquot_initialize(inode); + /* We want to block signals in delete_inode as the lock and * messaging paths may return us -ERESTARTSYS. Which would * cause us to exit early, resulting in inodes being orphaned @@ -1067,6 +1080,7 @@ static void ocfs2_clear_inode(struct inode *inode) { int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); clear_inode(inode); trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, @@ -1083,9 +1097,9 @@ static void ocfs2_clear_inode(struct inode *inode) /* Do these before all the other work so that we don't bounce * the downconvert thread while waiting to destroy the locks. */ - ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, &oi->ip_la_data_resv); @@ -1167,7 +1181,7 @@ void ocfs2_evict_inode(struct inode *inode) (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { ocfs2_delete_inode(inode); } else { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } ocfs2_clear_inode(inode); } @@ -1270,6 +1284,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); ocfs2_journal_dirty(handle, bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); leave: return status; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 621fc73bf23..a6c991c0fc9 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -44,7 +44,7 @@ struct ocfs2_inode_info struct rw_semaphore ip_xattr_sem; /* Number of outstanding AIO's which are not page aligned */ - atomic_t ip_unaligned_aio; + struct mutex ip_unaligned_aio; /* These fields are protected by ip_lock */ spinlock_t ip_lock; @@ -73,6 +73,13 @@ struct ocfs2_inode_info u32 ip_dir_lock_gen; struct ocfs2_alloc_reservation ip_la_data_resv; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; }; /* @@ -84,8 +91,6 @@ struct ocfs2_inode_info #define OCFS2_INODE_BITMAP 0x00000004 /* This inode has been wiped from disk */ #define OCFS2_INODE_DELETED 0x00000008 -/* Another node is deleting, so our delete is a nop */ -#define OCFS2_INODE_SKIP_DELETE 0x00000010 /* Has the inode been orphaned on another node? * * This hints to ocfs2_drop_inode that it should clear i_nlink before @@ -100,11 +105,11 @@ struct ocfs2_inode_info * rely on ocfs2_delete_inode to sort things out under the proper * cluster locks. */ -#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 +#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010 /* Does someone have the file open O_DIRECT */ -#define OCFS2_INODE_OPEN_DIRECT 0x00000040 +#define OCFS2_INODE_OPEN_DIRECT 0x00000020 /* Tell the inode wipe code it's not in orphan dir */ -#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080 +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index fa32ce9b455..6f66b3751ac 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/blkdev.h> #include <linux/compat.h> #include <cluster/masklog.h> @@ -142,8 +143,8 @@ bail: return status; } -int ocfs2_info_handle_blocksize(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_blocksize(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_blocksize oib; @@ -166,8 +167,8 @@ bail: return status; } -int ocfs2_info_handle_clustersize(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_clustersize(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_clustersize oic; @@ -191,8 +192,8 @@ bail: return status; } -int ocfs2_info_handle_maxslots(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_maxslots(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_maxslots oim; @@ -216,8 +217,8 @@ bail: return status; } -int ocfs2_info_handle_label(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_label(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_label oil; @@ -241,8 +242,8 @@ bail: return status; } -int ocfs2_info_handle_uuid(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_uuid(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_uuid oiu; @@ -266,8 +267,8 @@ bail: return status; } -int ocfs2_info_handle_fs_features(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_fs_features(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_fs_features oif; @@ -293,8 +294,8 @@ bail: return status; } -int ocfs2_info_handle_journal_size(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_journal_size(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_journal_size oij; @@ -318,9 +319,10 @@ bail: return status; } -int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, - struct inode *inode_alloc, u64 blkno, - struct ocfs2_info_freeinode *fi, u32 slot) +static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, + struct inode *inode_alloc, u64 blkno, + struct ocfs2_info_freeinode *fi, + u32 slot) { int status = 0, unlock = 0; @@ -365,8 +367,8 @@ bail: return status; } -int ocfs2_info_handle_freeinode(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_freeinode(struct inode *inode, + struct ocfs2_info_request __user *req) { u32 i; u64 blkno = -1; @@ -412,11 +414,12 @@ int ocfs2_info_handle_freeinode(struct inode *inode, } status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); - if (status < 0) - goto bail; iput(inode_alloc); inode_alloc = NULL; + + if (status < 0) + goto bail; } o2info_set_request_filled(&oifi->ifi_req); @@ -460,19 +463,19 @@ static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, stats->ffs_free_chunks_real++; } -void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, - unsigned int chunksize) +static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, + unsigned int chunksize) { o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); o2ffg_update_stats(&(ffg->iff_ffs), chunksize); } -int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, - struct inode *gb_inode, - struct ocfs2_dinode *gb_dinode, - struct ocfs2_chain_rec *rec, - struct ocfs2_info_freefrag *ffg, - u32 chunks_in_group) +static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, + struct inode *gb_inode, + struct ocfs2_dinode *gb_dinode, + struct ocfs2_chain_rec *rec, + struct ocfs2_info_freefrag *ffg, + u32 chunks_in_group) { int status = 0, used; u64 blkno; @@ -570,9 +573,9 @@ bail: return status; } -int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, - struct inode *gb_inode, u64 blkno, - struct ocfs2_info_freefrag *ffg) +static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, + struct inode *gb_inode, u64 blkno, + struct ocfs2_info_freefrag *ffg) { u32 chunks_in_group; int status = 0, unlock = 0, i; @@ -650,8 +653,8 @@ bail: return status; } -int ocfs2_info_handle_freefrag(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_freefrag(struct inode *inode, + struct ocfs2_info_request __user *req) { u64 blkno = -1; char namebuf[40]; @@ -721,8 +724,8 @@ out_err: return status; } -int ocfs2_info_handle_unknown(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_unknown(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_request oir; @@ -750,8 +753,8 @@ bail: * - distinguish different requests. * - validate size of different requests. */ -int ocfs2_info_handle_request(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_request(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_request oir; @@ -809,8 +812,8 @@ bail: return status; } -int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, - u64 *req_addr, int compat_flag) +static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, + u64 *req_addr, int compat_flag) { int status = -EFAULT; u64 __user *bp = NULL; @@ -847,8 +850,8 @@ bail: * a better backward&forward compatibility, since a small piece of * request will be less likely to be broken if disk layout get changed. */ -int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, - int compat_flag) +static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, + int compat_flag) { int i, status = 0; u64 req_addr; @@ -966,15 +969,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FITRIM: { struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; + range.minlen = max_t(u64, q->limits.discard_granularity, + range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) return ret; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 44fc3e530c3..4b0c68849b3 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -30,6 +30,7 @@ #include <linux/kthread.h> #include <linux/time.h> #include <linux/random.h> +#include <linux/delay.h> #include <cluster/masklog.h> @@ -2132,12 +2133,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; spin_lock(&oi->ip_lock); - /* The remote delete code may have set these on the - * assumption that the other node would wipe them - * successfully. If they are still in the node's - * orphan dir, we need to reset that state. */ - oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); - /* Set the proper information to get us going into * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; @@ -2191,8 +2186,20 @@ static int ocfs2_commit_thread(void *arg) || kthread_should_stop()); status = ocfs2_commit_cache(osb); - if (status < 0) - mlog_errno(status); + if (status < 0) { + static unsigned long abort_warn_time; + + /* Warn about this once per minute */ + if (printk_timed_ratelimit(&abort_warn_time, 60*HZ)) + mlog(ML_ERROR, "status = %d, journal is " + "already aborted.\n", status); + /* + * After ocfs2_commit_cache() fails, j_num_trans has a + * non-zero value. Sleep here to avoid a busy-wait + * loop. + */ + msleep_interruptible(1000); + } if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ mlog(ML_KTHREAD, diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 0b479bab367..7f8cde94abf 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -524,8 +524,7 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb) * the result may be wrong. */ static inline int ocfs2_calc_extend_credits(struct super_block *sb, - struct ocfs2_extent_list *root_el, - u32 bits_wanted) + struct ocfs2_extent_list *root_el) { int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks; @@ -627,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode, new_size); } +static inline void ocfs2_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + oi->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + oi->i_datasync_tid = handle->h_transaction->t_tid; +} + #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index cd5496b7a0a..04401345562 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -781,6 +781,48 @@ bail: return status; } +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits) +{ + int status, start; + u32 clear_bits; + struct inode *local_alloc_inode; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + bitmap = la->la_bitmap; + start = bit_off - le32_to_cpu(la->la_bm_off); + clear_bits = num_bits; + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while (clear_bits--) + ocfs2_clear_bit(start++, bitmap); + + le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + +bail: + return status; +} + static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) { u32 count; diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index 1be9b586446..44a7d1fb2de 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -55,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, u32 *bit_off, u32 *num_bits); +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits); + void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, unsigned int num_clusters); void ocfs2_la_enable_worker(struct work_struct *work); diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index e57c804069e..6b6d092b099 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -82,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, } ret = flock_lock_file_wait(file, fl); + if (ret) + ocfs2_file_unlock(file); out: mutex_unlock(&fp->fp_mutex); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 3d3f3c83065..599eb4c4c8b 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -151,6 +151,7 @@ static int __ocfs2_move_extent(handle_t *handle, old_blkno, len); } + ocfs2_update_inode_fsync_trans(handle, inode, 0); out: ocfs2_free_path(path); return ret; @@ -201,8 +202,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, } } - *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, - clusters_to_move + 2); + *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el); mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", extra_blocks, clusters_to_move, *credits); @@ -562,83 +562,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, - handle_t *handle, - struct buffer_head *di_bh, - u32 num_bits, - u16 chain) -{ - int ret; - u32 tmp_used; - struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - struct ocfs2_chain_list *cl = - (struct ocfs2_chain_list *) &di->id2.i_chain; - - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); - di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); - ocfs2_journal_dirty(handle, di_bh); - -out: - return ret; -} - -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits) -{ - int status; - void *bitmap = bg->bg_bitmap; - int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; - - /* All callers get the descriptor via - * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ - BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); - BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); - - mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, - num_bits); - - if (ocfs2_is_cluster_bitmap(alloc_inode)) - journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - - status = ocfs2_journal_access_gd(handle, - INODE_CACHE(alloc_inode), - group_bh, - journal_type); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - le16_add_cpu(&bg->bg_free_bits_count, -num_bits); - if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; - } - while (num_bits--) - ocfs2_set_bit(bit_off++, bitmap); - - ocfs2_journal_dirty(handle, group_bh); - -bail: - return status; -} - static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, u32 len, int ext_flags) @@ -768,8 +691,11 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, goal_bit, len); - if (ret) + if (ret) { + ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, + le16_to_cpu(gd->bg_chain)); mlog_errno(ret); + } /* * Here we should write the new page out first if we are @@ -1035,6 +961,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) inode->i_ctime = CURRENT_TIME; di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); @@ -1067,8 +994,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) if (status) return status; - if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) + if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) { + status = -EPERM; goto out_drop; + } if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { status = -EPERM; @@ -1090,8 +1019,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) goto out_free; } - if (range.me_start > i_size_read(inode)) + if (range.me_start > i_size_read(inode)) { + status = -EINVAL; goto out_free; + } if (range.me_start + range.me_len > i_size_read(inode)) range.me_len = i_size_read(inode) - range.me_start; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index be3f8676a43..8add6f1030d 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) return inode; } +static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, + struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); +} + static int ocfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -230,6 +245,8 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; + struct posix_acl *default_acl = NULL, *acl = NULL; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -331,6 +348,12 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + status = posix_acl_create(dir, &mode, &default_acl, &acl); + if (status) { + mlog_errno(status); + goto leave; + } + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), xattr_credits)); @@ -379,8 +402,17 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } - status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, - meta_ac, data_ac); + if (default_acl) { + status = ocfs2_set_acl(handle, inode, new_fe_bh, + ACL_TYPE_DEFAULT, default_acl, + meta_ac, data_ac); + } + if (!status && acl) { + status = ocfs2_set_acl(handle, inode, new_fe_bh, + ACL_TYPE_ACCESS, acl, + meta_ac, data_ac); + } + if (status < 0) { mlog_errno(status); goto leave; @@ -407,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, &lookup); @@ -419,6 +453,10 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); if (status < 0 && did_quota_inode) dquot_free_inode(inode); if (handle) @@ -430,7 +468,6 @@ leave: brelse(new_fe_bh); brelse(parent_fe_bh); - kfree(si.name); kfree(si.value); ocfs2_free_dir_lookup_result(&lookup); @@ -450,6 +487,9 @@ leave: * ocfs2_delete_inode will mutex_lock again. */ if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); @@ -475,6 +515,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; u16 feat; + struct ocfs2_inode_info *oi = OCFS2_I(inode); *new_fe_bh = NULL; @@ -489,7 +530,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, *new_fe_bh = sb_getblk(osb->sb, fe_blkno); if (!*new_fe_bh) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto leave; } @@ -556,8 +597,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, mlog_errno(status); } - status = 0; /* error in ocfs2_create_new_inode_locks is not - * critical */ + oi->i_sync_tid = handle->h_transaction->t_tid; + oi->i_datasync_tid = handle->h_transaction->t_tid; leave: if (status < 0) { @@ -644,6 +685,7 @@ static int ocfs2_link(struct dentry *old_dentry, struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; + u64 old_de_ino; trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno, old_dentry->d_name.len, old_dentry->d_name.name, @@ -666,6 +708,22 @@ static int ocfs2_link(struct dentry *old_dentry, goto out; } + err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name, + old_dentry->d_name.len, &old_de_ino); + if (err) { + err = -ENOENT; + goto out; + } + + /* + * Check whether another node removed the source inode while we + * were in the vfs. + */ + if (old_de_ino != OCFS2_I(inode)->ip_blkno) { + err = -ENOENT; + goto out; + } + err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, dentry->d_name.len); if (err) @@ -948,12 +1006,71 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status && (status != -ENOTEMPTY)) + if (status && (status != -ENOTEMPTY) && (status != -ENOENT)) mlog_errno(status); return status; } +static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, + u64 src_inode_no, u64 dest_inode_no) +{ + int ret = 0, i = 0; + u64 parent_inode_no = 0; + u64 child_inode_no = src_inode_no; + struct inode *child_inode; + +#define MAX_LOOKUP_TIMES 32 + while (1) { + child_inode = ocfs2_iget(osb, child_inode_no, 0, 0); + if (IS_ERR(child_inode)) { + ret = PTR_ERR(child_inode); + break; + } + + ret = ocfs2_inode_lock(child_inode, NULL, 0); + if (ret < 0) { + iput(child_inode); + if (ret != -ENOENT) + mlog_errno(ret); + break; + } + + ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2, + &parent_inode_no); + ocfs2_inode_unlock(child_inode, 0); + iput(child_inode); + if (ret < 0) { + ret = -ENOENT; + break; + } + + if (parent_inode_no == dest_inode_no) { + ret = 1; + break; + } + + if (parent_inode_no == osb->root_inode->i_ino) { + ret = 0; + break; + } + + child_inode_no = parent_inode_no; + + if (++i >= MAX_LOOKUP_TIMES) { + mlog(ML_NOTICE, "max lookup times reached, filesystem " + "may have nested directories, " + "src inode: %llu, dest inode: %llu.\n", + (unsigned long long)src_inode_no, + (unsigned long long)dest_inode_no); + ret = 0; + break; + } + } + + return ret; +} + /* * The only place this should be used is rename! * if they have the same id, then the 1st one is the only one locked. @@ -965,6 +1082,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, struct inode *inode2) { int status; + int inode1_is_ancestor, inode2_is_ancestor; struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); struct buffer_head **tmpbh; @@ -978,9 +1096,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, if (*bh2) *bh2 = NULL; - /* we always want to lock the one with the lower lockid first. */ + /* we always want to lock the one with the lower lockid first. + * and if they are nested, we lock ancestor first */ if (oi1->ip_blkno != oi2->ip_blkno) { - if (oi1->ip_blkno < oi2->ip_blkno) { + inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, + oi1->ip_blkno); + if (inode1_is_ancestor < 0) { + status = inode1_is_ancestor; + goto bail; + } + + inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno, + oi2->ip_blkno); + if (inode2_is_ancestor < 0) { + status = inode2_is_ancestor; + goto bail; + } + + if ((inode1_is_ancestor == 1) || + (oi1->ip_blkno < oi2->ip_blkno && + inode2_is_ancestor == 0)) { /* switch id1 and id2 around */ tmpbh = bh2; bh2 = bh1; @@ -1061,6 +1196,7 @@ static int ocfs2_rename(struct inode *old_dir, struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; struct ocfs2_dir_lookup_result target_insert = { NULL, }; + bool should_add_orphan = false; /* At some point it might be nice to break this function up a * bit. */ @@ -1097,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } rename_lock = 1; + + /* here we cannot guarantee the inodes haven't just been + * changed, so check if they are nested again */ + status = ocfs2_check_if_ancestor(osb, new_dir->i_ino, + old_inode->i_ino); + if (status < 0) { + mlog_errno(status); + goto bail; + } else if (status == 1) { + status = -EPERM; + trace_ocfs2_rename_not_permitted( + (unsigned long long)old_inode->i_ino, + (unsigned long long)new_dir->i_ino); + goto bail; + } } /* if old and new are the same, this'll just do one lock. */ @@ -1267,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir, mlog_errno(status); goto bail; } + should_add_orphan = true; } } else { BUG_ON(new_dentry->d_parent->d_inode != new_dir); @@ -1311,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (S_ISDIR(new_inode->i_mode) || - (ocfs2_read_links_count(newfe) == 1)) { - status = ocfs2_orphan_add(osb, handle, new_inode, - newfe_bh, orphan_name, - &orphan_insert, orphan_dir); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - /* change the dirent to point to the correct inode */ status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, old_inode); @@ -1336,6 +1477,15 @@ static int ocfs2_rename(struct inode *old_dir, else ocfs2_add_links_count(newfe, -1); ocfs2_journal_dirty(handle, newfe_bh); + if (should_add_orphan) { + status = ocfs2_orphan_add(osb, handle, new_inode, + newfe_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } } else { /* if the name was not found in new_dir, add it now */ status = ocfs2_add_entry(handle, new_dentry, old_inode, @@ -1605,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_symlink_begin(dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); @@ -1793,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, &lookup); @@ -1818,7 +1971,6 @@ bail: brelse(new_fe_bh); brelse(parent_fe_bh); - kfree(si.name); kfree(si.value); ocfs2_free_dir_lookup_result(&lookup); if (inode_ac) @@ -1828,6 +1980,9 @@ bail: if (xattr_ac) ocfs2_free_alloc_context(xattr_ac); if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); @@ -2444,6 +2599,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di->i_orphaned_slot = 0; set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, @@ -2504,4 +2660,5 @@ const struct inode_operations ocfs2_dir_iops = { .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 3a903470c79..bbec539230f 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -30,6 +30,7 @@ #include <linux/sched.h> #include <linux/wait.h> #include <linux/list.h> +#include <linux/llist.h> #include <linux/rbtree.h> #include <linux/workqueue.h> #include <linux/kref.h> @@ -274,19 +275,16 @@ enum ocfs2_mount_options OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ }; -#define OCFS2_OSB_SOFT_RO 0x0001 -#define OCFS2_OSB_HARD_RO 0x0002 -#define OCFS2_OSB_ERROR_FS 0x0004 -#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 - -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_replay_map; struct ocfs2_quota_recovery; -struct ocfs2_dentry_lock; struct ocfs2_super { struct task_struct *commit_task; @@ -387,6 +385,7 @@ struct ocfs2_super u8 osb_stackflags; char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; @@ -413,10 +412,9 @@ struct ocfs2_super struct list_head blocked_lock_list; unsigned long blocked_lock_count; - /* List of dentry locks to release. Anyone can add locks to - * the list, ocfs2_wq processes the list */ - struct ocfs2_dentry_lock *dentry_lock_list; - struct work_struct dentry_lock_work; + /* List of dquot structures to drop last reference to */ + struct llist_head dquot_drop_list; + struct work_struct dquot_drop_work; wait_queue_head_t osb_mount_event; @@ -424,6 +422,7 @@ struct ocfs2_super struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct delayed_work osb_truncate_log_wq; + atomic_t osb_tl_disable; /* * How many clusters in our truncate log. * It must be protected by osb_tl_inode->i_mutex. @@ -448,6 +447,8 @@ struct ocfs2_super /* rb tree root for refcount lock. */ struct rb_root osb_rf_lock_tree; struct ocfs2_refcount_tree *osb_ref_tree_lru; + + struct mutex system_file_mutex; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) @@ -578,18 +579,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } - -static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, - unsigned long flag) -{ - unsigned long ret; - - spin_lock(&osb->osb_lock); - ret = osb->osb_flags & flag; - spin_unlock(&osb->osb_lock); - return ret; -} - static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 1b60c62aa9d..6cb019b7c6a 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename, __entry->new_len, __get_str(new_name)) ); +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted); + TRACE_EVENT(ocfs2_rename_target_exists, TP_PROTO(int new_len, const char *new_name), TP_ARGS(new_len, new_name), diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index d5ab56cbe5c..f266d67df3c 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -28,6 +28,7 @@ struct ocfs2_dquot { unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ s64 dq_origspace; /* Last globally synced space usage */ s64 dq_originodes; /* Last globally synced inode usage */ + struct llist_node list; /* Member of list of dquots to drop */ }; /* Description of one chunk to recover in memory */ @@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, int ocfs2_create_local_dquot(struct dquot *dquot); int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); int ocfs2_local_write_dquot(struct dquot *dquot); +void ocfs2_drop_dquot_refs(struct work_struct *work); extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index aaa50611ec6..b990a62cff5 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -10,6 +10,7 @@ #include <linux/jiffies.h> #include <linux/writeback.h> #include <linux/workqueue.h> +#include <linux/llist.h> #include <cluster/masklog.h> @@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) OCFS2_INODE_UPDATE_CREDITS; } +void ocfs2_drop_dquot_refs(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dquot_drop_work); + struct llist_node *list; + struct ocfs2_dquot *odquot, *next_odquot; + + list = llist_del_all(&osb->dquot_drop_list); + llist_for_each_entry_safe(odquot, next_odquot, list, list) { + /* Drop the reference we acquired in ocfs2_dquot_release() */ + dqput(&odquot->dq_dquot); + } +} + +/* + * Called when the last reference to dquot is dropped. If we are called from + * downconvert thread, we cannot do all the handling here because grabbing + * quota lock could deadlock (the node holding the quota lock could need some + * other cluster lock to proceed but with blocked downconvert thread we cannot + * release any lock). + */ static int ocfs2_release_dquot(struct dquot *dquot) { handle_t *handle; @@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot) /* Check whether we are not racing with some other dqget() */ if (atomic_read(&dquot->dq_count) > 1) goto out; + /* Running from downconvert thread? Postpone quota processing to wq */ + if (current == osb->dc_task) { + /* + * Grab our own reference to dquot and queue it for delayed + * dropping. Quota code rechecks after calling + * ->release_dquot() and won't free dquot structure. + */ + dqgrab(dquot); + /* First entry on list -> queue work */ + if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) + queue_work(ocfs2_wq, &osb->dquot_drop_work); + goto out; + } status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; @@ -717,6 +752,12 @@ static int ocfs2_release_dquot(struct dquot *dquot) */ if (status < 0) mlog_errno(status); + /* + * Clear dq_off so that we search for the structure in quota file next + * time we acquire it. The structure might be deleted and reallocated + * elsewhere by another node while our dquot structure is on freelist. + */ + dquot->dq_off = 0; clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_trans: ocfs2_commit_trans(osb, handle); @@ -756,16 +797,17 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) status = ocfs2_lock_global_qf(info, 1); if (status < 0) goto out; - if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { - status = ocfs2_qinfo_lock(info, 0); - if (status < 0) - goto out_dq; - status = qtree_read_dquot(&info->dqi_gi, dquot); - ocfs2_qinfo_unlock(info, 0); - if (status < 0) - goto out_dq; - } - set_bit(DQ_READ_B, &dquot->dq_flags); + status = ocfs2_qinfo_lock(info, 0); + if (status < 0) + goto out_dq; + /* + * We always want to read dquot structure from disk because we don't + * know what happened with it while it was on freelist. + */ + status = qtree_read_dquot(&info->dqi_gi, dquot); + ocfs2_qinfo_unlock(info, 0); + if (status < 0) + goto out_dq; OCFS2_DQUOT(dquot)->dq_use_count++; OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 2e4344be3b9..2001862bf2b 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -1303,10 +1303,6 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); out: - /* Clear the read bit so that next time someone uses this - * dquot he reads fresh info from disk and allocates local - * dquot structure */ - clear_bit(DQ_READ_B, &dquot->dq_flags); return status; } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index bf4dfc14bb2..636aab69ead 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -46,6 +46,7 @@ #include <linux/quotaops.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/posix_acl.h> struct ocfs2_cow_context { struct inode *inode; @@ -612,6 +613,11 @@ static int ocfs2_create_refcount_tree(struct inode *inode, } new_bh = sb_getblk(inode->i_sb, first_blkno); + if (!new_bh) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, @@ -1310,7 +1316,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, new_bh = sb_getblk(sb, blkno); if (new_bh == NULL) { - ret = -EIO; + ret = -ENOMEM; mlog_errno(ret); goto out; } @@ -1402,10 +1408,9 @@ static void swap_refcount_rec(void *a, void *b, int size) { struct ocfs2_refcount_rec *l = a, *r = b, tmp; - tmp = *(struct ocfs2_refcount_rec *)l; - *(struct ocfs2_refcount_rec *)l = - *(struct ocfs2_refcount_rec *)r; - *(struct ocfs2_refcount_rec *)r = tmp; + tmp = *l; + *l = *r; + *r = tmp; } /* @@ -1561,7 +1566,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, new_bh = sb_getblk(sb, blkno); if (new_bh == NULL) { - ret = -EIO; + ret = -ENOMEM; mlog_errno(ret); goto out; } @@ -2502,8 +2507,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); *meta_add += ocfs2_extend_meta_needed(et.et_root_el); *credits += ocfs2_calc_extend_credits(sb, - et.et_root_el, - ref_blocks); + et.et_root_el); } else { *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; *meta_add += 1; @@ -2874,8 +2878,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb, meta_add = ocfs2_extend_meta_needed(et->et_root_el); - *credits += ocfs2_calc_extend_credits(sb, et->et_root_el, - num_clusters + 2); + *credits += ocfs2_calc_extend_credits(sb, et->et_root_el); ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, p_cluster, num_clusters, @@ -3031,7 +3034,7 @@ int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, for (i = 0; i < blocks; i++, old_block++, new_block++) { new_bh = sb_getblk(osb->sb, new_block); if (new_bh == NULL) { - ret = -EIO; + ret = -ENOMEM; mlog_errno(ret); break; } @@ -3625,8 +3628,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode, ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh); *credits += ocfs2_calc_extend_credits(inode->i_sb, - et.et_root_el, - ref_blocks); + et.et_root_el); } out: @@ -4266,20 +4268,36 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct inode *inode = old_dentry->d_inode; struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; + struct posix_acl *default_acl, *acl; + umode_t mode; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + mode = inode->i_mode; + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) { + mlog_errno(error); + goto out; + } + + error = ocfs2_create_inode_in_orphan(dir, mode, &new_orphan_inode); if (error) { mlog_errno(error); goto out; } + error = ocfs2_rw_lock(inode, 1); + if (error) { + mlog_errno(error); + goto out; + } + error = ocfs2_inode_lock(inode, &old_bh, 1); if (error) { mlog_errno(error); + ocfs2_rw_unlock(inode, 1); goto out; } @@ -4291,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); brelse(old_bh); if (error) { @@ -4301,11 +4320,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, - &new_dentry->d_name); + &new_dentry->d_name, + default_acl, acl); if (error) mlog_errno(error); } out: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index ec55add7604..d5da6f62414 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -53,8 +53,6 @@ */ static u16 ocfs2_calc_new_backup_super(struct inode *inode, struct ocfs2_group_desc *gd, - int new_clusters, - u32 first_new_cluster, u16 cl_cpg, int set) { @@ -127,8 +125,6 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, OCFS2_FEATURE_COMPAT_BACKUP_SB)) { backups = ocfs2_calc_new_backup_super(bm_inode, group, - new_clusters, - first_new_cluster, cl_cpg, 1); le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } @@ -157,7 +153,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, spin_lock(&OCFS2_I(bm_inode)->ip_lock); OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits); + le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(bm_inode)->ip_lock); i_size_write(bm_inode, le64_to_cpu(fe->i_size)); @@ -167,8 +163,6 @@ out_rollback: if (ret < 0) { ocfs2_calc_new_backup_super(bm_inode, group, - new_clusters, - first_new_cluster, cl_cpg, 0); le16_add_cpu(&group->bg_free_bits_count, backups); le16_add_cpu(&group->bg_bits, -1 * num_bits); @@ -469,6 +463,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) struct ocfs2_chain_list *cl; struct ocfs2_chain_rec *cr; u16 cl_bpc; + u64 bg_ptr; if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; @@ -513,7 +508,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh); if (ret) { mlog_errno(ret); - goto out_unlock; + goto out_free_group_bh; } trace_ocfs2_group_add((unsigned long long)input->group, @@ -523,7 +518,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) if (IS_ERR(handle)) { mlog_errno(PTR_ERR(handle)); ret = -EINVAL; - goto out_unlock; + goto out_free_group_bh; } cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); @@ -538,12 +533,14 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) } group = (struct ocfs2_group_desc *)group_bh->b_data; + bg_ptr = le64_to_cpu(group->bg_next_group); group->bg_next_group = cr->c_blkno; ocfs2_journal_dirty(handle, group_bh); ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { + group->bg_next_group = cpu_to_le64(bg_ptr); mlog_errno(ret); goto out_commit; } @@ -566,7 +563,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits); + le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); @@ -574,8 +571,11 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) out_commit: ocfs2_commit_trans(osb, handle); -out_unlock: + +out_free_group_bh: brelse(group_bh); + +out_unlock: brelse(main_bm_bh); ocfs2_inode_unlock(main_bm_inode, 1); diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index bf1f8930456..1724d43d3da 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) return 0; } -static int o2cb_cluster_this_node(unsigned int *node) +static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { int node_num; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 286edf1e231..13a8537d8e8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -23,6 +23,7 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/reboot.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include "stackglue.h" @@ -102,6 +103,12 @@ #define OCFS2_TEXT_UUID_LEN 32 #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 +#define VERSION_LOCK "version_lock" + +enum ocfs2_connection_type { + WITH_CONTROLD, + NO_CONTROLD +}; /* * ocfs2_live_connection is refcounted because the filesystem and @@ -110,6 +117,13 @@ struct ocfs2_live_connection { struct list_head oc_list; struct ocfs2_cluster_connection *oc_conn; + enum ocfs2_connection_type oc_type; + atomic_t oc_this_node; + int oc_our_slot; + struct dlm_lksb oc_version_lksb; + char oc_lvb[DLM_LVB_LEN]; + struct completion oc_sync_wait; + wait_queue_head_t oc_wait; }; struct ocfs2_control_private { @@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) * mount path. Since the VFS prevents multiple calls to * fill_super(), we can't get dupes here. */ -static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, - struct ocfs2_live_connection **c_ret) +static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, + struct ocfs2_live_connection *c) { int rc = 0; - struct ocfs2_live_connection *c; - - c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); - if (!c) - return -ENOMEM; mutex_lock(&ocfs2_control_lock); c->oc_conn = conn; - if (atomic_read(&ocfs2_control_opened)) + if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) list_add(&c->oc_list, &ocfs2_live_connection_list); else { printk(KERN_ERR @@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, } mutex_unlock(&ocfs2_control_lock); - - if (!rc) - *c_ret = c; - else - kfree(c); - return rc; } @@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, return 0; } +static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + ver->pv_major = pv->pv_major; + ver->pv_minor = pv->pv_minor; +} + +static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + pv->pv_major = ver->pv_major; + pv->pv_minor = ver->pv_minor; +} + +static void sync_wait_cb(void *arg) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + complete(&lc->oc_sync_wait); +} + +static int sync_unlock(struct ocfs2_cluster_connection *conn, + struct dlm_lksb *lksb, char *name) +{ + int error; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn); + if (error) { + printk(KERN_ERR "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + printk(KERN_ERR "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct ocfs2_cluster_connection *conn, + int mode, uint32_t flags, + struct dlm_lksb *lksb, char *name) +{ + int error, status; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_lock(conn->cc_lockspace, mode, lksb, flags, + name, strlen(name), + 0, sync_wait_cb, conn, NULL); + if (error) { + printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + + +static int version_lock(struct ocfs2_cluster_connection *conn, int mode, + int flags) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_lock(conn, mode, flags, + &lc->oc_version_lksb, VERSION_LOCK); +} + +static int version_unlock(struct ocfs2_cluster_connection *conn) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); +} + +/* get_protocol_version() + * + * To exchange ocfs2 versioning, we use the LVB of the version dlm lock. + * The algorithm is: + * 1. Attempt to take the lock in EX mode (non-blocking). + * 2. If successful (which means it is the first mount), write the + * version number and downconvert to PR lock. + * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after + * taking the PR lock. + */ + +static int get_protocol_version(struct ocfs2_cluster_connection *conn) +{ + int ret; + struct ocfs2_live_connection *lc = conn->cc_private; + struct ocfs2_protocol_version pv; + + running_proto.pv_major = + ocfs2_user_plugin.sp_max_proto.pv_major; + running_proto.pv_minor = + ocfs2_user_plugin.sp_max_proto.pv_minor; + + lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb; + ret = version_lock(conn, DLM_LOCK_EX, + DLM_LKF_VALBLK|DLM_LKF_NOQUEUE); + if (!ret) { + conn->cc_version.pv_major = running_proto.pv_major; + conn->cc_version.pv_minor = running_proto.pv_minor; + version_to_lvb(&running_proto, lc->oc_lvb); + version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + } else if (ret == -EAGAIN) { + ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK); + if (ret) + goto out; + lvb_to_version(lc->oc_lvb, &pv); + + if ((pv.pv_major != running_proto.pv_major) || + (pv.pv_minor > running_proto.pv_minor)) { + ret = -EINVAL; + goto out; + } + + conn->cc_version.pv_major = pv.pv_major; + conn->cc_version.pv_minor = pv.pv_minor; + } +out: + return ret; +} + +static void user_recover_prep(void *arg) +{ +} + +static void user_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct ocfs2_cluster_connection *conn = arg; + printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n", + slot->nodeid, slot->slot); + conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data); + +} + +static void user_recover_done(void *arg, struct dlm_slot *slots, + int num_slots, int our_slot, + uint32_t generation) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + int i; + + for (i = 0; i < num_slots; i++) + if (slots[i].slot == our_slot) { + atomic_set(&lc->oc_this_node, slots[i].nodeid); + break; + } + + lc->oc_our_slot = our_slot; + wake_up(&lc->oc_wait); +} + +static const struct dlm_lockspace_ops ocfs2_ls_ops = { + .recover_prep = user_recover_prep, + .recover_slot = user_recover_slot, + .recover_done = user_recover_done, +}; + +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + version_unlock(conn); + dlm_release_lockspace(conn->cc_lockspace, 2); + conn->cc_lockspace = NULL; + ocfs2_live_connection_drop(conn->cc_private); + conn->cc_private = NULL; + return 0; +} + static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; - struct ocfs2_live_connection *uninitialized_var(control); - int rc = 0; + struct ocfs2_live_connection *lc; + int rc, ops_rv; BUG_ON(conn == NULL); - rc = ocfs2_live_connection_new(conn, &control); + lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); + if (!lc) { + rc = -ENOMEM; + goto out; + } + + init_waitqueue_head(&lc->oc_wait); + init_completion(&lc->oc_sync_wait); + atomic_set(&lc->oc_this_node, 0); + conn->cc_private = lc; + lc->oc_type = NO_CONTROLD; + + rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, + DLM_LSFL_FS, DLM_LVB_LEN, + &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); + if (rc) + goto out; + + if (ops_rv == -EOPNOTSUPP) { + lc->oc_type = WITH_CONTROLD; + printk(KERN_NOTICE "ocfs2: You seem to be using an older " + "version of dlm_controld and/or ocfs2-tools." + " Please consider upgrading.\n"); + } else if (ops_rv) { + rc = ops_rv; + goto out; + } + conn->cc_lockspace = fsdlm; + + rc = ocfs2_live_connection_attach(conn, lc); if (rc) goto out; + if (lc->oc_type == NO_CONTROLD) { + rc = get_protocol_version(conn); + if (rc) { + printk(KERN_ERR "ocfs2: Could not determine" + " locking version\n"); + user_cluster_disconnect(conn); + goto out; + } + wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); + } + /* * running_proto must have been set before we allowed any mounts * to proceed. @@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) if (fs_protocol_compare(&running_proto, &conn->cc_version)) { printk(KERN_ERR "Unable to mount with fs locking protocol version " - "%u.%u because the userspace control daemon has " - "negotiated %u.%u\n", + "%u.%u because negotiated protocol is %u.%u\n", conn->cc_version.pv_major, conn->cc_version.pv_minor, running_proto.pv_major, running_proto.pv_minor); rc = -EPROTO; - ocfs2_live_connection_drop(control); - goto out; - } - - rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, - NULL, NULL, NULL, &fsdlm); - if (rc) { - ocfs2_live_connection_drop(control); - goto out; + ocfs2_live_connection_drop(lc); + lc = NULL; } - conn->cc_private = control; - conn->cc_lockspace = fsdlm; out: + if (rc && lc) + kfree(lc); return rc; } -static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) -{ - dlm_release_lockspace(conn->cc_lockspace, 2); - conn->cc_lockspace = NULL; - ocfs2_live_connection_drop(conn->cc_private); - conn->cc_private = NULL; - return 0; -} -static int user_cluster_this_node(unsigned int *this_node) +static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *this_node) { int rc; + struct ocfs2_live_connection *lc = conn->cc_private; + + if (lc->oc_type == WITH_CONTROLD) + rc = ocfs2_control_get_this_node(); + else if (lc->oc_type == NO_CONTROLD) + rc = atomic_read(&lc->oc_this_node); + else + rc = -EINVAL; - rc = ocfs2_control_get_this_node(); if (rc < 0) return rc; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 39abf89697e..5d965e83bd4 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, EXPORT_SYMBOL_GPL(ocfs2_plock); int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, @@ -342,8 +344,12 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } - memcpy(new_conn->cc_name, group, grouplen); + strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; + if (cluster_name_len) + strlcpy(new_conn->cc_cluster_name, cluster_name, + CLUSTER_NAME_MAX + 1); + new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; @@ -386,8 +392,9 @@ int ocfs2_cluster_connect_agnostic(const char *group, if (cluster_stack_name[0]) stack_name = cluster_stack_name; - return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, - recovery_handler, recovery_data, conn); + return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, + lproto, recovery_handler, recovery_data, + conn); } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); @@ -460,9 +467,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen) } EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); -int ocfs2_cluster_this_node(unsigned int *node) +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { - return active_stack->sp_ops->this_node(node); + return active_stack->sp_ops->this_node(conn, node); } EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); @@ -488,7 +496,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_max_locking_protocol = - __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, + __ATTR(max_locking_protocol, S_IRUGO, ocfs2_max_locking_protocol_show, NULL); static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, @@ -520,7 +528,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = - __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, + __ATTR(loaded_cluster_plugins, S_IRUGO, ocfs2_loaded_cluster_plugins_show, NULL); static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, @@ -542,7 +550,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_active_cluster_plugin = - __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, + __ATTR(active_cluster_plugin, S_IRUGO, ocfs2_active_cluster_plugin_show, NULL); static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, @@ -591,15 +599,29 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, static struct kobj_attribute ocfs2_attr_cluster_stack = - __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, + __ATTR(cluster_stack, S_IRUGO | S_IWUSR, ocfs2_cluster_stack_show, ocfs2_cluster_stack_store); + + +static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "1\n"); +} + +static struct kobj_attribute ocfs2_attr_dlm_recover_support = + __ATTR(dlm_recover_callback_support, S_IRUGO, + ocfs2_dlm_recover_show, NULL); + static struct attribute *ocfs2_attrs[] = { &ocfs2_attr_max_locking_protocol.attr, &ocfs2_attr_loaded_cluster_plugins.attr, &ocfs2_attr_active_cluster_plugin.attr, &ocfs2_attr_cluster_stack.attr, + &ocfs2_attr_dlm_recover_support.attr, NULL, }; @@ -643,7 +665,7 @@ error: #define FS_OCFS2_NM 1 -static ctl_table ocfs2_nm_table[] = { +static struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", .data = ocfs2_hb_ctl_path, @@ -654,7 +676,7 @@ static ctl_table ocfs2_nm_table[] = { { } }; -static ctl_table ocfs2_mod_table[] = { +static struct ctl_table ocfs2_mod_table[] = { { .procname = "nm", .data = NULL, @@ -665,7 +687,7 @@ static ctl_table ocfs2_mod_table[] = { { } }; -static ctl_table ocfs2_kern_table[] = { +static struct ctl_table ocfs2_kern_table[] = { { .procname = "ocfs2", .data = NULL, @@ -676,7 +698,7 @@ static ctl_table ocfs2_kern_table[] = { { } }; -static ctl_table ocfs2_root_table[] = { +static struct ctl_table ocfs2_root_table[] = { { .procname = "fs", .data = NULL, @@ -687,7 +709,7 @@ static ctl_table ocfs2_root_table[] = { { } }; -static struct ctl_table_header *ocfs2_table_header = NULL; +static struct ctl_table_header *ocfs2_table_header; /* diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 1ec56fdb8d0..66334a30cea 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -45,6 +45,9 @@ struct file_lock; */ #define GROUP_NAME_MAX 64 +/* This shadows OCFS2_CLUSTER_NAME_LEN */ +#define CLUSTER_NAME_MAX 16 + /* * ocfs2_protocol_version changes when ocfs2 does something different in @@ -97,8 +100,10 @@ struct ocfs2_locking_protocol { * locking compatibility. */ struct ocfs2_cluster_connection { - char cc_name[GROUP_NAME_MAX]; + char cc_name[GROUP_NAME_MAX + 1]; int cc_namelen; + char cc_cluster_name[CLUSTER_NAME_MAX + 1]; + int cc_cluster_name_len; struct ocfs2_protocol_version cc_version; struct ocfs2_locking_protocol *cc_proto; void (*cc_recovery_handler)(int node_num, void *recovery_data); @@ -152,7 +157,8 @@ struct ocfs2_stack_operations { * ->this_node() returns the cluster's unique identifier for the * local node. */ - int (*this_node)(unsigned int *node); + int (*this_node)(struct ocfs2_cluster_connection *conn, + unsigned int *node); /* * Call the underlying dlm lock function. The ->dlm_lock() @@ -239,6 +245,8 @@ struct ocfs2_stack_plugin { /* Used by the filesystem */ int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, @@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group, int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending); void ocfs2_cluster_hangup(const char *group, int grouplen); -int ocfs2_cluster_this_node(unsigned int *node); +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node); struct ocfs2_lock_res; int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 5397c07ce60..0cb889a17ae 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, struct ocfs2_suballoc_result *res); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits); static int ocfs2_relink_block_group(handle_t *handle, struct inode *alloc_inode, struct buffer_head *fe_bh, @@ -481,7 +475,7 @@ ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle, bg_bh = sb_getblk(osb->sb, bg_blkno); if (!bg_bh) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto bail; } @@ -661,7 +655,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle, bg_bh = sb_getblk(osb->sb, bg_blkno); if (!bg_bh) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto bail; } @@ -777,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); + ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); status = 0; @@ -1343,7 +1338,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, return status; } -static inline int ocfs2_block_group_set_bits(handle_t *handle, +int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, @@ -1388,8 +1383,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, ocfs2_journal_dirty(handle, group_bh); bail: - if (status) - mlog_errno(status); return status; } @@ -1588,7 +1581,7 @@ static int ocfs2_block_group_search(struct inode *inode, return ret; } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, +int ocfs2_alloc_dinode_update_counts(struct inode *inode, handle_t *handle, struct buffer_head *di_bh, u32 num_bits, @@ -1615,6 +1608,21 @@ out: return ret; } +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl; + + cl = (struct ocfs2_chain_list *)&di->id2.i_chain; + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); + le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); +} + static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, struct ocfs2_extent_rec *rec, struct ocfs2_chain_list *cl) @@ -1715,8 +1723,12 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, res->sr_bit_offset, res->sr_bits); - if (ret < 0) + if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, + res->sr_bits, + le16_to_cpu(gd->bg_chain)); mlog_errno(ret); + } out_loc_only: *bits_left = le16_to_cpu(gd->bg_free_bits_count); @@ -1846,6 +1858,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, res->sr_bit_offset, res->sr_bits); if (status < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, + ac->ac_bh, res->sr_bits, chain); mlog_errno(status); goto bail; } @@ -2099,7 +2113,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir, ac->ac_find_loc_priv = res; *fe_blkno = res->sr_blkno; - + ocfs2_update_inode_fsync_trans(handle, dir, 0); out: if (handle) ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); @@ -2157,6 +2171,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle, res->sr_bit_offset, res->sr_bits); if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, + ac->ac_bh, res->sr_bits, chain); mlog_errno(ret); goto out; } @@ -2878,6 +2894,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { mutex_unlock(&inode_alloc_inode->i_mutex); + iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); goto bail; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a36d0aa5091..2d2501767c0 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -86,6 +86,22 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, u32 bits_wanted, struct ocfs2_alloc_context **ac); +int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); +int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); + int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index d4e81e4a9b0..ddb662b3244 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -68,7 +68,6 @@ #include "super.h" #include "sysfile.h" #include "uptodate.h" -#include "ver.h" #include "xattr.h" #include "quota.h" #include "refcounttree.h" @@ -76,7 +75,7 @@ #include "buffer_head_io.h" -static struct kmem_cache *ocfs2_inode_cachep = NULL; +static struct kmem_cache *ocfs2_inode_cachep; struct kmem_cache *ocfs2_dquot_cachep; struct kmem_cache *ocfs2_qf_chunk_cachep; @@ -86,10 +85,11 @@ struct kmem_cache *ocfs2_qf_chunk_cachep; * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq = NULL; -static struct dentry *ocfs2_debugfs_root = NULL; +static struct dentry *ocfs2_debugfs_root; MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster file system"); struct mount_options { @@ -561,6 +561,9 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) if (!oi) return NULL; + oi->i_sync_tid = 0; + oi->i_datasync_tid = 0; + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); return &oi->vfs_inode; } @@ -631,6 +634,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) struct ocfs2_super *osb = OCFS2_SB(sb); u32 tmp; + sync_filesystem(sb); + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || !ocfs2_check_set_options(sb, &parsed_options)) { ret = -EINVAL; @@ -1238,30 +1243,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); } -static void ocfs2_kill_sb(struct super_block *sb) -{ - struct ocfs2_super *osb = OCFS2_SB(sb); - - /* Failed mount? */ - if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) - goto out; - - /* Prevent further queueing of inode drop events */ - spin_lock(&dentry_list_lock); - ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); - spin_unlock(&dentry_list_lock); - /* Wait for work to finish and/or remove it */ - cancel_work_sync(&osb->dentry_lock_work); -out: - kill_block_super(sb); -} - static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", .mount = ocfs2_mount, - .kill_sb = ocfs2_kill_sb, - + .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; @@ -1612,16 +1598,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) return 0; } -wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; - static int __init ocfs2_init(void) { - int status, i; - - ocfs2_print_version(); - - for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) - init_waitqueue_head(&ocfs2__ioend_wq[i]); + int status; status = init_ocfs2_uptodate_cache(); if (status < 0) @@ -1763,7 +1742,7 @@ static void ocfs2_inode_init_once(void *data) ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); oi->ip_dir_start_lookup = 0; - atomic_set(&oi->ip_unaligned_aio, 0); + mutex_init(&oi->ip_unaligned_aio); init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); @@ -1848,8 +1827,8 @@ static int ocfs2_get_sector(struct super_block *sb, *bh = sb_getblk(sb, block); if (!*bh) { - mlog_errno(-EIO); - return -EIO; + mlog_errno(-ENOMEM); + return -ENOMEM; } lock_buffer(*bh); if (!buffer_dirty(*bh)) @@ -1934,17 +1913,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) debugfs_remove(osb->osb_ctxt); - /* - * Flush inode dropping work queue so that deletes are - * performed while the filesystem is still working - */ - ocfs2_drop_all_dl_inodes(osb); - /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); ocfs2_disable_quotas(osb); + /* All dquots should be freed by now */ + WARN_ON(!llist_empty(&osb->dquot_drop_list)); + /* Wait for worker to be done with the work structure in osb */ + cancel_work_sync(&osb->dquot_drop_work); + ocfs2_shutdown_local_alloc(osb); ocfs2_truncate_log_shutdown(osb); @@ -2075,7 +2053,6 @@ static int ocfs2_initialize_super(struct super_block *sb, struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; struct ocfs2_journal *journal; - __le32 uuid_net_key; struct ocfs2_super *osb; u64 total_blocks; @@ -2121,6 +2098,8 @@ static int ocfs2_initialize_super(struct super_block *sb, spin_lock_init(&osb->osb_xattr_lock); ocfs2_init_steal_slots(osb); + mutex_init(&osb->system_file_mutex); + atomic_set(&osb->alloc_stats.moves, 0); atomic_set(&osb->alloc_stats.local_data, 0); atomic_set(&osb->alloc_stats.bitmap_data, 0); @@ -2225,10 +2204,9 @@ static int ocfs2_initialize_super(struct super_block *sb, if (ocfs2_clusterinfo_valid(osb)) { osb->osb_stackflags = OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; - memcpy(osb->osb_cluster_stack, + strlcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, - OCFS2_STACK_LABEL_LEN); - osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + OCFS2_STACK_LABEL_LEN + 1); if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "couldn't mount because of an invalid " @@ -2237,6 +2215,9 @@ static int ocfs2_initialize_super(struct super_block *sb, status = -EINVAL; goto bail; } + strlcpy(osb->osb_cluster_name, + OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, + OCFS2_CLUSTER_NAME_LEN + 1); } else { /* The empty string is identical with classic tools that * don't know about s_cluster_info. */ @@ -2272,8 +2253,8 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; - INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); - osb->dentry_lock_list = NULL; + INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); + init_llist_head(&osb->dquot_drop_list); /* get some pseudo constants for clustersize bits */ osb->s_clustersize_bits = @@ -2307,10 +2288,8 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); - - strncpy(osb->vol_label, di->id2.i_super.s_label, 63); - osb->vol_label[63] = '\0'; + strlcpy(osb->vol_label, di->id2.i_super.s_label, + OCFS2_MAX_VOL_LABEL_LEN); osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); osb->first_cluster_group_blkno = diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index f053688d22a..af155c18312 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -113,9 +113,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, } else arr = get_local_system_inode(osb, type, slot); + mutex_lock(&osb->system_file_mutex); if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ inode = igrab(inode); + mutex_unlock(&osb->system_file_mutex); BUG_ON(!inode); return inode; @@ -129,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, *arr = igrab(inode); BUG_ON(!*arr); } + mutex_unlock(&osb->system_file_mutex); return inode; } diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 52eaf33d346..82e17b076ce 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -67,7 +67,7 @@ struct ocfs2_meta_cache_item { sector_t c_block; }; -static struct kmem_cache *ocfs2_uptodate_cachep = NULL; +static struct kmem_cache *ocfs2_uptodate_cachep; u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) { diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c deleted file mode 100644 index e2488f4128a..00000000000 --- a/fs/ocfs2/ver.c +++ /dev/null @@ -1,43 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define OCFS2_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION - -void ocfs2_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h deleted file mode 100644 index d7395cb91d2..00000000000 --- a/fs/ocfs2/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef OCFS2_VER_H -#define OCFS2_VER_H - -void ocfs2_print_version(void); - -#endif /* OCFS2_VER_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 6ce0686eab7..016f01df382 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -99,8 +99,8 @@ static struct ocfs2_xattr_def_value_root def_xv = { const struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, - &ocfs2_xattr_acl_access_handler, - &ocfs2_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL @@ -109,9 +109,9 @@ const struct xattr_handler *ocfs2_xattr_handlers[] = { static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] - = &ocfs2_xattr_acl_access_handler, + = &posix_acl_access_xattr_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] - = &ocfs2_xattr_acl_default_handler, + = &posix_acl_default_xattr_handler, [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; @@ -369,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket) * them fully. */ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, - u64 xb_blkno) + u64 xb_blkno, int new) { int i, rc = 0; @@ -377,15 +377,22 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb, xb_blkno + i); if (!bucket->bu_bhs[i]) { - rc = -EIO; + rc = -ENOMEM; mlog_errno(rc); break; } if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), - bucket->bu_bhs[i])) - ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), - bucket->bu_bhs[i]); + bucket->bu_bhs[i])) { + if (new) + ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + else { + set_buffer_uptodate(bucket->bu_bhs[i]); + ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + } + } } if (rc) @@ -754,8 +761,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, BUG_ON(why == RESTART_META); credits = ocfs2_calc_extend_credits(inode->i_sb, - &vb->vb_xv->xr_list, - clusters_to_add); + &vb->vb_xv->xr_list); status = ocfs2_extend_trans(handle, credits); if (status < 0) { status = -ENOMEM; @@ -2603,6 +2609,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); out_commit: @@ -2865,6 +2872,12 @@ static int ocfs2_create_xattr_block(struct inode *inode, } new_bh = sb_getblk(inode->i_sb, first_blkno); + if (!new_bh) { + ret = -ENOMEM; + mlog_errno(ret); + goto end; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode), @@ -3040,8 +3053,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { clusters_add += new_clusters; credits += ocfs2_calc_extend_credits(inode->i_sb, - &def_xv.xv.xr_list, - new_clusters); + &def_xv.xv.xr_list); } goto meta_guess; @@ -3106,8 +3118,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, if (!ocfs2_xattr_is_local(xe)) credits += ocfs2_calc_extend_credits( inode->i_sb, - &def_xv.xv.xr_list, - new_clusters); + &def_xv.xv.xr_list); goto out; } } @@ -3132,9 +3143,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, meta_add += ocfs2_extend_meta_needed(&xv->xr_list); clusters_add += new_clusters - old_clusters; credits += ocfs2_calc_extend_credits(inode->i_sb, - &xv->xr_list, - new_clusters - - old_clusters); + &xv->xr_list); if (value_size >= OCFS2_XATTR_ROOT_SIZE) goto out; } @@ -3180,7 +3189,7 @@ meta_guess: &xb->xb_attrs.xb_root.xt_list; meta_add += ocfs2_extend_meta_needed(el); credits += ocfs2_calc_extend_credits(inode->i_sb, - el, 1); + el); } else credits += OCFS2_SUBALLOC_ALLOC + 1; @@ -3199,8 +3208,15 @@ meta_guess: clusters_add += 1; } } else { - meta_add += 1; credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + struct ocfs2_extent_list *el = &def_xv.xv.xr_list; + meta_add += ocfs2_extend_meta_needed(el); + credits += ocfs2_calc_extend_credits(inode->i_sb, + el); + } else { + meta_add += 1; + } } out: if (clusters_need) @@ -3613,6 +3629,7 @@ int ocfs2_xattr_set(struct inode *inode, } ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0); ocfs2_commit_trans(osb, ctxt.handle); @@ -4293,7 +4310,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); - ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); + ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1); if (ret) { mlog_errno(ret); goto out; @@ -4637,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, * Even if !new_bucket_head, we're overwriting t_bucket. Thus, * there's no need to read it. */ - ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); + ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head); if (ret) { mlog_errno(ret); goto out; @@ -4803,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, * Even if !t_is_new, we're overwriting t_bucket. Thus, * there's no need to read it. */ - ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); + ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new); if (ret) goto out; @@ -5475,6 +5492,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, ret = ocfs2_truncate_log_append(osb, handle, blkno, len); if (ret) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 0); out_commit: ocfs2_commit_trans(osb, handle); @@ -6216,8 +6234,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, le16_to_cpu(xv->xr_list.l_next_free_rec); *credits += ocfs2_calc_extend_credits(sb, - &def_xv.xv.xr_list, - le32_to_cpu(xv->xr_clusters)); + &def_xv.xv.xr_list); /* * If the value is a tree with depth > 1, We don't go deep @@ -6782,7 +6799,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators( metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el); *credits += ocfs2_calc_extend_credits(osb->sb, - xt_et->et_root_el, len); + xt_et->et_root_el); if (metas.num_metas) { ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas, @@ -6830,7 +6847,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle, break; } - ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); + ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1); if (ret) { mlog_errno(ret); break; @@ -7190,10 +7207,12 @@ out: */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr) + const struct qstr *qstr, + struct posix_acl *default_acl, + struct posix_acl *acl) { - int ret = 0; struct buffer_head *dir_bh = NULL; + int ret = 0; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7207,9 +7226,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); - if (ret) - mlog_errno(ret); + if (!ret && default_acl) + ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (!ret && acl) + ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS); ocfs2_inode_unlock(dir, 0); brelse(dir_bh); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 19f134e896a..f10d5b93c36 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -40,8 +40,6 @@ struct ocfs2_security_xattr_info { extern const struct xattr_handler ocfs2_xattr_user_handler; extern const struct xattr_handler ocfs2_xattr_trusted_handler; extern const struct xattr_handler ocfs2_xattr_security_handler; -extern const struct xattr_handler ocfs2_xattr_acl_access_handler; -extern const struct xattr_handler ocfs2_xattr_acl_default_handler; extern const struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); @@ -96,5 +94,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr); + const struct qstr *qstr, + struct posix_acl *default_acl, + struct posix_acl *acl); #endif /* OCFS2_XATTR_H */ diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 54d57d6ba68..902e88527fc 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -337,10 +337,10 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block) const struct file_operations omfs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index d8b0afde217..ec58c765918 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -183,7 +183,7 @@ int omfs_sync_inode(struct inode *inode) */ static void omfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_nlink) diff --git a/fs/open.c b/fs/open.c index d420331ca32..d6fd3acde13 100644 --- a/fs/open.c +++ b/fs/open.c @@ -57,7 +57,8 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, newattrs.ia_valid |= ret | ATTR_FORCE; mutex_lock(&dentry->d_inode->i_mutex); - ret = notify_change(dentry, &newattrs); + /* Note any delegations or leases have already been broken: */ + ret = notify_change(dentry, &newattrs, NULL); mutex_unlock(&dentry->d_inode->i_mutex); return ret; } @@ -230,7 +231,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EINVAL; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + return -EOPNOTSUPP; + + /* Punch hole and zero range are mutually exclusive */ + if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == + (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; /* Punch hole must have keep size set */ @@ -238,17 +245,30 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) !(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; + /* Collapse range should only be used exclusively. */ + if ((mode & FALLOC_FL_COLLAPSE_RANGE) && + (mode & ~FALLOC_FL_COLLAPSE_RANGE)) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - /* It's not possible punch hole on append only file */ - if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) + /* + * We can only allow pure fallocate on append only files + */ + if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* + * We cannot allow any fallocate operation on an active swapfile + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ @@ -464,21 +484,28 @@ out: static int chmod_common(struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; + struct inode *delegated_inode = NULL; struct iattr newattrs; int error; error = mnt_want_write(path->mnt); if (error) return error; +retry_deleg: mutex_lock(&inode->i_mutex); error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path->dentry, &newattrs); + error = notify_change(path->dentry, &newattrs, &delegated_inode); out_unlock: mutex_unlock(&inode->i_mutex); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } mnt_drop_write(path->mnt); return error; } @@ -522,6 +549,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) static int chown_common(struct path *path, uid_t user, gid_t group) { struct inode *inode = path->dentry->d_inode; + struct inode *delegated_inode = NULL; int error; struct iattr newattrs; kuid_t uid; @@ -546,12 +574,17 @@ static int chown_common(struct path *path, uid_t user, gid_t group) if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; +retry_deleg: mutex_lock(&inode->i_mutex); error = security_path_chown(path, uid, gid); if (!error) - error = notify_change(path->dentry, &newattrs); + error = notify_change(path->dentry, &newattrs, &delegated_inode); mutex_unlock(&inode->i_mutex); - + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } return error; } @@ -618,35 +651,6 @@ out: return error; } -/* - * You have to be very careful that these write - * counts get cleaned up in error cases and - * upon __fput(). This should probably never - * be called outside of __dentry_open(). - */ -static inline int __get_file_write_access(struct inode *inode, - struct vfsmount *mnt) -{ - int error; - error = get_write_access(inode); - if (error) - return error; - /* - * Do not take mount writer counts on - * special files since no writes to - * the mount itself will occur. - */ - if (!special_file(inode->i_mode)) { - /* - * Balanced in __fput() - */ - error = __mnt_want_write(mnt); - if (error) - put_write_access(inode); - } - return error; -} - int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ @@ -671,28 +675,37 @@ static int do_dentry_open(struct file *f, f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - if (unlikely(f->f_flags & O_PATH)) - f->f_mode = FMODE_PATH; - path_get(&f->f_path); inode = f->f_inode = f->f_path.dentry->d_inode; - if (f->f_mode & FMODE_WRITE) { - error = __get_file_write_access(inode, f->f_path.mnt); - if (error) - goto cleanup_file; - if (!special_file(inode->i_mode)) - file_take_write(f); - } - f->f_mapping = inode->i_mapping; - file_sb_list_add(f, inode->i_sb); - if (unlikely(f->f_mode & FMODE_PATH)) { + if (unlikely(f->f_flags & O_PATH)) { + f->f_mode = FMODE_PATH; f->f_op = &empty_fops; return 0; } + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { + error = get_write_access(inode); + if (unlikely(error)) + goto cleanup_file; + error = __mnt_want_write(f->f_path.mnt); + if (unlikely(error)) { + put_write_access(inode); + goto cleanup_file; + } + f->f_mode |= FMODE_WRITER; + } + + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ + if (S_ISREG(inode->i_mode)) + f->f_mode |= FMODE_ATOMIC_POS; + f->f_op = fops_get(inode->i_fop); + if (unlikely(WARN_ON(!f->f_op))) { + error = -ENODEV; + goto cleanup_all; + } error = security_file_open(f, cred); if (error) @@ -702,7 +715,7 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; - if (!open && f->f_op) + if (!open) open = f->f_op->open; if (open) { error = open(inode, f); @@ -711,6 +724,12 @@ static int do_dentry_open(struct file *f, } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(inode); + if ((f->f_mode & FMODE_READ) && + likely(f->f_op->read || f->f_op->aio_read || f->f_op->read_iter)) + f->f_mode |= FMODE_CAN_READ; + if ((f->f_mode & FMODE_WRITE) && + likely(f->f_op->write || f->f_op->aio_write || f->f_op->write_iter)) + f->f_mode |= FMODE_CAN_WRITE; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -720,19 +739,9 @@ static int do_dentry_open(struct file *f, cleanup_all: fops_put(f->f_op); - file_sb_list_del(f); - if (f->f_mode & FMODE_WRITE) { + if (f->f_mode & FMODE_WRITER) { put_write_access(inode); - if (!special_file(inode->i_mode)) { - /* - * We don't consider this a real - * mnt_want/drop_write() pair - * because it all happenend right - * here, so just reset the state. - */ - file_reset_write(f); - __mnt_drop_write(f->f_path.mnt); - } + __mnt_drop_write(f->f_path.mnt); } cleanup_file: path_put(&f->f_path); @@ -1023,7 +1032,7 @@ int filp_close(struct file *filp, fl_owner_t id) return 0; } - if (filp->f_op && filp->f_op->flush) + if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 8c0ceb8dd1f..15e4500cda3 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -368,6 +368,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino) static int openprom_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NOATIME; return 0; } diff --git a/fs/pipe.c b/fs/pipe.c index d2c45e14e6d..21981e58e2a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -116,99 +116,6 @@ void pipe_wait(struct pipe_inode_info *pipe) pipe_lock(pipe); } -static int -pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, - int atomic) -{ - unsigned long copy; - - while (len > 0) { - while (!iov->iov_len) - iov++; - copy = min_t(unsigned long, len, iov->iov_len); - - if (atomic) { - if (__copy_from_user_inatomic(to, iov->iov_base, copy)) - return -EFAULT; - } else { - if (copy_from_user(to, iov->iov_base, copy)) - return -EFAULT; - } - to += copy; - len -= copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - return 0; -} - -static int -pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, - int atomic) -{ - unsigned long copy; - - while (len > 0) { - while (!iov->iov_len) - iov++; - copy = min_t(unsigned long, len, iov->iov_len); - - if (atomic) { - if (__copy_to_user_inatomic(iov->iov_base, from, copy)) - return -EFAULT; - } else { - if (copy_to_user(iov->iov_base, from, copy)) - return -EFAULT; - } - from += copy; - len -= copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - return 0; -} - -/* - * Attempt to pre-fault in the user memory, so we can use atomic copies. - * Returns the number of bytes not faulted in. - */ -static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) -{ - while (!iov->iov_len) - iov++; - - while (len > 0) { - unsigned long this_len; - - this_len = min_t(unsigned long, len, iov->iov_len); - if (fault_in_pages_writeable(iov->iov_base, this_len)) - break; - - len -= this_len; - iov++; - } - - return len; -} - -/* - * Pre-fault in the user memory, so we can use atomic copies. - */ -static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) -{ - while (!iov->iov_len) - iov++; - - while (len > 0) { - unsigned long this_len; - - this_len = min_t(unsigned long, len, iov->iov_len); - fault_in_pages_readable(iov->iov_base, this_len); - len -= this_len; - iov++; - } -} - static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -226,52 +133,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, } /** - * generic_pipe_buf_map - virtually map a pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be mapped - * @atomic: whether to use an atomic map - * - * Description: - * This function returns a kernel virtual address mapping for the - * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided - * and the caller has to be careful not to fault before calling - * the unmap function. - * - * Note that this function calls kmap_atomic() if @atomic != 0. - */ -void *generic_pipe_buf_map(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, int atomic) -{ - if (atomic) { - buf->flags |= PIPE_BUF_FLAG_ATOMIC; - return kmap_atomic(buf->page); - } - - return kmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_map); - -/** - * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be unmapped - * @map_data: the data that the mapping function returned - * - * Description: - * This function undoes the mapping that ->map() provided. - */ -void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, void *map_data) -{ - if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { - buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; - kunmap_atomic(map_data); - } else - kunmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_unmap); - -/** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal @@ -351,8 +212,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -361,8 +220,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -370,17 +227,14 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = { }; static ssize_t -pipe_read(struct kiocb *iocb, const struct iovec *_iov, - unsigned long nr_segs, loff_t pos) +pipe_read(struct kiocb *iocb, struct iov_iter *to) { + size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; int do_wakeup; ssize_t ret; - struct iovec *iov = (struct iovec *)_iov; - size_t total_len; - total_len = iov_length(iov, nr_segs); /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; @@ -394,9 +248,9 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, int curbuf = pipe->curbuf; struct pipe_buffer *buf = pipe->bufs + curbuf; const struct pipe_buf_operations *ops = buf->ops; - void *addr; size_t chars = buf->len; - int error, atomic; + size_t written; + int error; if (chars > total_len) chars = total_len; @@ -408,21 +262,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, break; } - atomic = !iov_fault_in_pages_write(iov, chars); -redo: - addr = ops->map(pipe, buf, atomic); - error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); - ops->unmap(pipe, buf, addr); - if (unlikely(error)) { - /* - * Just retry with the slow path if we failed. - */ - if (atomic) { - atomic = 0; - goto redo; - } + written = copy_page_to_iter(buf->page, buf->offset, chars, to); + if (unlikely(written < chars)) { if (!ret) - ret = error; + ret = -EFAULT; break; } ret += chars; @@ -493,24 +336,19 @@ static inline int is_packetized(struct file *file) } static ssize_t -pipe_write(struct kiocb *iocb, const struct iovec *_iov, - unsigned long nr_segs, loff_t ppos) +pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - ssize_t ret; - int do_wakeup; - struct iovec *iov = (struct iovec *)_iov; - size_t total_len; + ssize_t ret = 0; + int do_wakeup = 0; + size_t total_len = iov_iter_count(from); ssize_t chars; - total_len = iov_length(iov, nr_segs); /* Null write succeeds. */ if (unlikely(total_len == 0)) return 0; - do_wakeup = 0; - ret = 0; __pipe_lock(pipe); if (!pipe->readers) { @@ -529,32 +367,19 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, int offset = buf->offset + buf->len; if (ops->can_merge && offset + chars <= PAGE_SIZE) { - int error, atomic = 1; - void *addr; - - error = ops->confirm(pipe, buf); + int error = ops->confirm(pipe, buf); if (error) goto out; - iov_fault_in_pages_read(iov, chars); -redo1: - addr = ops->map(pipe, buf, atomic); - error = pipe_iov_copy_from_user(offset + addr, iov, - chars, atomic); - ops->unmap(pipe, buf, addr); - ret = error; - do_wakeup = 1; - if (error) { - if (atomic) { - atomic = 0; - goto redo1; - } + ret = copy_page_from_iter(buf->page, offset, chars, from); + if (unlikely(ret < chars)) { + error = -EFAULT; goto out; } + do_wakeup = 1; buf->len += chars; - total_len -= chars; ret = chars; - if (!total_len) + if (!iov_iter_count(from)) goto out; } } @@ -573,8 +398,7 @@ redo1: int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; - char *src; - int error, atomic = 1; + int copied; if (!page) { page = alloc_page(GFP_HIGHUSER); @@ -590,40 +414,19 @@ redo1: * FIXME! Is this really true? */ do_wakeup = 1; - chars = PAGE_SIZE; - if (chars > total_len) - chars = total_len; - - iov_fault_in_pages_read(iov, chars); -redo2: - if (atomic) - src = kmap_atomic(page); - else - src = kmap(page); - - error = pipe_iov_copy_from_user(src, iov, chars, - atomic); - if (atomic) - kunmap_atomic(src); - else - kunmap(page); - - if (unlikely(error)) { - if (atomic) { - atomic = 0; - goto redo2; - } + copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); + if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { if (!ret) - ret = error; + ret = -EFAULT; break; } - ret += chars; + ret += copied; /* Insert it into the buffer array */ buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; - buf->len = chars; + buf->len = copied; buf->flags = 0; if (is_packetized(filp)) { buf->ops = &packet_pipe_buf_ops; @@ -632,8 +435,7 @@ redo2: pipe->nrbufs = ++bufs; pipe->tmp_page = NULL; - total_len -= chars; - if (!total_len) + if (!iov_iter_count(from)) break; } if (bufs < pipe->buffers) @@ -663,10 +465,11 @@ out: wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } - if (ret > 0) { + if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { int err = file_update_time(filp); if (err) ret = err; + sb_end_write(file_inode(filp)->i_sb); } return ret; } @@ -726,11 +529,25 @@ pipe_poll(struct file *filp, poll_table *wait) return mask; } +static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) +{ + int kill = 0; + + spin_lock(&inode->i_lock); + if (!--pipe->files) { + inode->i_pipe = NULL; + kill = 1; + } + spin_unlock(&inode->i_lock); + + if (kill) + free_pipe_info(pipe); +} + static int pipe_release(struct inode *inode, struct file *file) { - struct pipe_inode_info *pipe = inode->i_pipe; - int kill = 0; + struct pipe_inode_info *pipe = file->private_data; __pipe_lock(pipe); if (file->f_mode & FMODE_READ) @@ -743,17 +560,9 @@ pipe_release(struct inode *inode, struct file *file) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - spin_lock(&inode->i_lock); - if (!--pipe->files) { - inode->i_pipe = NULL; - kill = 1; - } - spin_unlock(&inode->i_lock); __pipe_unlock(pipe); - if (kill) - free_pipe_info(pipe); - + put_pipe_info(inode, pipe); return 0; } @@ -1014,7 +823,6 @@ static int fifo_open(struct inode *inode, struct file *filp) { struct pipe_inode_info *pipe; bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; - int kill = 0; int ret; filp->f_version = 0; @@ -1130,25 +938,19 @@ err_wr: goto err; err: - spin_lock(&inode->i_lock); - if (!--pipe->files) { - inode->i_pipe = NULL; - kill = 1; - } - spin_unlock(&inode->i_lock); __pipe_unlock(pipe); - if (kill) - free_pipe_info(pipe); + + put_pipe_info(inode, pipe); return ret; } const struct file_operations pipefifo_fops = { .open = fifo_open, .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = do_sync_write, - .aio_write = pipe_write, + .read = new_sync_read, + .read_iter = pipe_read, + .write = new_sync_write, + .write_iter = pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, diff --git a/fs/pnode.c b/fs/pnode.c index 9af0df15256..302bf22c4a3 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -164,46 +164,94 @@ static struct mount *propagation_next(struct mount *m, } } -/* - * return the source mount to be used for cloning - * - * @dest the current destination mount - * @last_dest the last seen destination mount - * @last_src the last seen source mount - * @type return CL_SLAVE if the new mount has to be - * cloned as a slave. - */ -static struct mount *get_source(struct mount *dest, - struct mount *last_dest, - struct mount *last_src, - int *type) +static struct mount *next_group(struct mount *m, struct mount *origin) { - struct mount *p_last_src = NULL; - struct mount *p_last_dest = NULL; - - while (last_dest != dest->mnt_master) { - p_last_dest = last_dest; - p_last_src = last_src; - last_dest = last_dest->mnt_master; - last_src = last_src->mnt_master; + while (1) { + while (1) { + struct mount *next; + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + return first_slave(m); + next = next_peer(m); + if (m->mnt_group_id == origin->mnt_group_id) { + if (next == origin) + return NULL; + } else if (m->mnt_slave.next != &next->mnt_slave) + break; + m = next; + } + /* m is the last peer */ + while (1) { + struct mount *master = m->mnt_master; + if (m->mnt_slave.next != &master->mnt_slave_list) + return next_slave(m); + m = next_peer(master); + if (master->mnt_group_id == origin->mnt_group_id) + break; + if (master->mnt_slave.next == &m->mnt_slave) + break; + m = master; + } + if (m == origin) + return NULL; } +} - if (p_last_dest) { - do { - p_last_dest = next_peer(p_last_dest); - } while (IS_MNT_NEW(p_last_dest)); - /* is that a peer of the earlier? */ - if (dest == p_last_dest) { - *type = CL_MAKE_SHARED; - return p_last_src; +/* all accesses are serialized by namespace_sem */ +static struct user_namespace *user_ns; +static struct mount *last_dest, *last_source, *dest_master; +static struct mountpoint *mp; +static struct hlist_head *list; + +static int propagate_one(struct mount *m) +{ + struct mount *child; + int type; + /* skip ones added by this propagate_mnt() */ + if (IS_MNT_NEW(m)) + return 0; + /* skip if mountpoint isn't covered by it */ + if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) + return 0; + if (m->mnt_group_id == last_dest->mnt_group_id) { + type = CL_MAKE_SHARED; + } else { + struct mount *n, *p; + for (n = m; ; n = p) { + p = n->mnt_master; + if (p == dest_master || IS_MNT_MARKED(p)) { + while (last_dest->mnt_master != p) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + if (n->mnt_group_id != last_dest->mnt_group_id) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + break; + } } + type = CL_SLAVE; + /* beginning of peer group among the slaves? */ + if (IS_MNT_SHARED(m)) + type |= CL_MAKE_SHARED; + } + + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(last_source, last_source->mnt.mnt_root, type); + if (IS_ERR(child)) + return PTR_ERR(child); + mnt_set_mountpoint(m, mp, child); + last_dest = m; + last_source = child; + if (m->mnt_master != dest_master) { + read_seqlock_excl(&mount_lock); + SET_MNT_MARK(m->mnt_master); + read_sequnlock_excl(&mount_lock); } - /* slave of the earlier, then */ - *type = CL_SLAVE; - /* beginning of peer group among the slaves? */ - if (IS_MNT_SHARED(dest)) - *type |= CL_MAKE_SHARED; - return last_src; + hlist_add_head(&child->mnt_hash, list); + return 0; } /* @@ -220,56 +268,50 @@ static struct mount *get_source(struct mount *dest, * @tree_list : list of heads of trees to be attached. */ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, - struct mount *source_mnt, struct list_head *tree_list) + struct mount *source_mnt, struct hlist_head *tree_list) { - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; - struct mount *m, *child; + struct mount *m, *n; int ret = 0; - struct mount *prev_dest_mnt = dest_mnt; - struct mount *prev_src_mnt = source_mnt; - LIST_HEAD(tmp_list); - - for (m = propagation_next(dest_mnt, dest_mnt); m; - m = propagation_next(m, dest_mnt)) { - int type; - struct mount *source; - - if (IS_MNT_NEW(m)) - continue; - source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); - - /* Notice when we are propagating across user namespaces */ - if (m->mnt_ns->user_ns != user_ns) - type |= CL_UNPRIVILEGED; - - child = copy_tree(source, source->mnt.mnt_root, type); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - list_splice(tree_list, tmp_list.prev); + /* + * we don't want to bother passing tons of arguments to + * propagate_one(); everything is serialized by namespace_sem, + * so globals will do just fine. + */ + user_ns = current->nsproxy->mnt_ns->user_ns; + last_dest = dest_mnt; + last_source = source_mnt; + mp = dest_mp; + list = tree_list; + dest_master = dest_mnt->mnt_master; + + /* all peers of dest_mnt, except dest_mnt itself */ + for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { + ret = propagate_one(n); + if (ret) goto out; - } + } - if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { - mnt_set_mountpoint(m, dest_mp, child); - list_add_tail(&child->mnt_hash, tree_list); - } else { - /* - * This can happen if the parent mount was bind mounted - * on some subdirectory of a shared/slave mount. - */ - list_add_tail(&child->mnt_hash, &tmp_list); - } - prev_dest_mnt = m; - prev_src_mnt = child; + /* all slave groups */ + for (m = next_group(dest_mnt, dest_mnt); m; + m = next_group(m, dest_mnt)) { + /* everything in that slave group */ + n = m; + do { + ret = propagate_one(n); + if (ret) + goto out; + n = next_peer(n); + } while (n != m); } out: - br_write_lock(&vfsmount_lock); - while (!list_empty(&tmp_list)) { - child = list_first_entry(&tmp_list, struct mount, mnt_hash); - umount_tree(child, 0); + read_seqlock_excl(&mount_lock); + hlist_for_each_entry(n, tree_list, mnt_hash) { + m = n->mnt_parent; + if (m->mnt_master != dest_mnt->mnt_master) + CLEAR_MNT_MARK(m->mnt_master); } - br_write_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); return ret; } @@ -278,8 +320,7 @@ out: */ static inline int do_refcount_check(struct mount *mnt, int count) { - int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts; - return (mycount > count); + return mnt_get_count(mnt) > count; } /* @@ -311,7 +352,7 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint, 0); + child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); if (child && list_empty(&child->mnt_mounts) && (ret = do_refcount_check(child, 1))) break; @@ -333,14 +374,16 @@ static void __propagate_umount(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - struct mount *child = __lookup_mnt(&m->mnt, - mnt->mnt_mountpoint, 0); + struct mount *child = __lookup_mnt_last(&m->mnt, + mnt->mnt_mountpoint); /* * umount the child only if the child has no * other children */ - if (child && list_empty(&child->mnt_mounts)) - list_move_tail(&child->mnt_hash, &mnt->mnt_hash); + if (child && list_empty(&child->mnt_mounts)) { + hlist_del_init_rcu(&child->mnt_hash); + hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); + } } } @@ -351,11 +394,11 @@ static void __propagate_umount(struct mount *mnt) * * vfsmount lock must be held for write */ -int propagate_umount(struct list_head *list) +int propagate_umount(struct hlist_head *list) { struct mount *mnt; - list_for_each_entry(mnt, list, mnt_hash) + hlist_for_each_entry(mnt, list, mnt_hash) __propagate_umount(mnt); return 0; } diff --git a/fs/pnode.h b/fs/pnode.h index 59e7eda1851..4a246358b03 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -16,6 +16,9 @@ #define IS_MNT_NEW(m) (!(m)->mnt_ns) #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) +#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) +#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) +#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 @@ -36,8 +39,8 @@ static inline void set_mnt_shared(struct mount *mnt) void change_mnt_propagation(struct mount *, int); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, - struct list_head *); -int propagate_umount(struct list_head *); + struct hlist_head *); +int propagate_umount(struct hlist_head *); int propagate_mount_busy(struct mount *, int); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8bd2135b7f8..0855f772cd4 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1,10 +1,8 @@ /* - * linux/fs/posix_acl.c + * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org> * - * Copyright (C) 2002 by Andreas Gruenbacher <a.gruenbacher@computer.org> - * - * Fixes from William Schumacher incorporated on 15 March 2001. - * (Reported by Charles Bertsch, <CBertsch@microtest.com>). + * Fixes from William Schumacher incorporated on 15 March 2001. + * (Reported by Charles Bertsch, <CBertsch@microtest.com>). */ /* @@ -18,15 +16,112 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> #include <linux/export.h> +#include <linux/user_namespace.h> -#include <linux/errno.h> +struct posix_acl **acl_by_type(struct inode *inode, int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return &inode->i_acl; + case ACL_TYPE_DEFAULT: + return &inode->i_default_acl; + default: + BUG(); + } +} +EXPORT_SYMBOL(acl_by_type); -EXPORT_SYMBOL(posix_acl_init); -EXPORT_SYMBOL(posix_acl_alloc); -EXPORT_SYMBOL(posix_acl_valid); -EXPORT_SYMBOL(posix_acl_equiv_mode); -EXPORT_SYMBOL(posix_acl_from_mode); +struct posix_acl *get_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *acl = ACCESS_ONCE(*p); + if (acl) { + spin_lock(&inode->i_lock); + acl = *p; + if (acl != ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } + return acl; +} +EXPORT_SYMBOL(get_cached_acl); + +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) +{ + return rcu_dereference(*acl_by_type(inode, type)); +} +EXPORT_SYMBOL(get_cached_acl_rcu); + +void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + rcu_assign_pointer(*p, posix_acl_dup(acl)); + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(set_cached_acl); + +void forget_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + *p = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(forget_cached_acl); + +void forget_all_cached_acls(struct inode *inode) +{ + struct posix_acl *old_access, *old_default; + spin_lock(&inode->i_lock); + old_access = inode->i_acl; + old_default = inode->i_default_acl; + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old_access != ACL_NOT_CACHED) + posix_acl_release(old_access); + if (old_default != ACL_NOT_CACHED) + posix_acl_release(old_default); +} +EXPORT_SYMBOL(forget_all_cached_acls); + +struct posix_acl *get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + if (!IS_POSIXACL(inode)) + return NULL; + + /* + * A filesystem can force a ACL callback by just never filling the + * ACL cache. But normally you'd fill the cache either at inode + * instantiation time, or on the first ->get_acl call. + * + * If the filesystem doesn't have a get_acl() function at all, we'll + * just create the negative cache entry. + */ + if (!inode->i_op->get_acl) { + set_cached_acl(inode, type, NULL); + return NULL; + } + return inode->i_op->get_acl(inode, type); +} +EXPORT_SYMBOL(get_acl); /* * Init a fresh posix_acl @@ -37,6 +132,7 @@ posix_acl_init(struct posix_acl *acl, int count) atomic_set(&acl->a_refcount, 1); acl->a_count = count; } +EXPORT_SYMBOL(posix_acl_init); /* * Allocate a new ACL with the specified number of entries. @@ -51,6 +147,7 @@ posix_acl_alloc(int count, gfp_t flags) posix_acl_init(acl, count); return acl; } +EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. @@ -78,8 +175,6 @@ posix_acl_valid(const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; - kuid_t prev_uid = INVALID_UID; - kgid_t prev_gid = INVALID_GID; int needs_mask = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { @@ -98,10 +193,6 @@ posix_acl_valid(const struct posix_acl *acl) return -EINVAL; if (!uid_valid(pa->e_uid)) return -EINVAL; - if (uid_valid(prev_uid) && - uid_lte(pa->e_uid, prev_uid)) - return -EINVAL; - prev_uid = pa->e_uid; needs_mask = 1; break; @@ -117,10 +208,6 @@ posix_acl_valid(const struct posix_acl *acl) return -EINVAL; if (!gid_valid(pa->e_gid)) return -EINVAL; - if (gid_valid(prev_gid) && - gid_lte(pa->e_gid, prev_gid)) - return -EINVAL; - prev_gid = pa->e_gid; needs_mask = 1; break; @@ -146,6 +233,7 @@ posix_acl_valid(const struct posix_acl *acl) return 0; return -EINVAL; } +EXPORT_SYMBOL(posix_acl_valid); /* * Returns 0 if the acl can be exactly represented in the traditional @@ -158,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) umode_t mode = 0; int not_equiv = 0; + /* + * A null ACL can always be presented as mode bits. + */ + if (!acl) + return 0; + FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: @@ -186,6 +280,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } +EXPORT_SYMBOL(posix_acl_equiv_mode); /* * Create an ACL representing the file mode permission bits of an inode. @@ -207,6 +302,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags) acl->a_entries[2].e_perm = (mode & S_IRWXO); return acl; } +EXPORT_SYMBOL(posix_acl_from_mode); /* * Return 0 if current is granted want access to the inode @@ -338,7 +434,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) /* * Modify the ACL for the chmod syscall. */ -static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) +static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; @@ -384,7 +480,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) } int -posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) +__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; @@ -399,15 +495,15 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) *acl = clone; return err; } -EXPORT_SYMBOL(posix_acl_create); +EXPORT_SYMBOL(__posix_acl_create); int -posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) +__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; if (clone) { - err = posix_acl_chmod_masq(clone, mode); + err = __posix_acl_chmod_masq(clone, mode); if (err) { posix_acl_release(clone); clone = NULL; @@ -417,4 +513,389 @@ posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) *acl = clone; return err; } +EXPORT_SYMBOL(__posix_acl_chmod); + +int +posix_acl_chmod(struct inode *inode, umode_t mode) +{ + struct posix_acl *acl; + int ret = 0; + + if (!IS_POSIXACL(inode)) + return 0; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR_OR_NULL(acl)) { + if (acl == ERR_PTR(-EOPNOTSUPP)) + return 0; + return PTR_ERR(acl); + } + + ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); + if (ret) + return ret; + ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + return ret; +} EXPORT_SYMBOL(posix_acl_chmod); + +int +posix_acl_create(struct inode *dir, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl) +{ + struct posix_acl *p; + int ret; + + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) + goto no_acl; + + p = get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(p)) { + if (p == ERR_PTR(-EOPNOTSUPP)) + goto apply_umask; + return PTR_ERR(p); + } + + if (!p) + goto apply_umask; + + *acl = posix_acl_clone(p, GFP_NOFS); + if (!*acl) + return -ENOMEM; + + ret = posix_acl_create_masq(*acl, mode); + if (ret < 0) { + posix_acl_release(*acl); + return -ENOMEM; + } + + if (ret == 0) { + posix_acl_release(*acl); + *acl = NULL; + } + + if (!S_ISDIR(*mode)) { + posix_acl_release(p); + *default_acl = NULL; + } else { + *default_acl = p; + } + return 0; + +apply_umask: + *mode &= ~current_umask(); +no_acl: + *default_acl = NULL; + *acl = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(posix_acl_create); + +/* + * Fix up the uids and gids in posix acl extended attributes in place. + */ +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + kuid_t uid; + kgid_t gid; + + if (!value) + return; + if (size < sizeof(posix_acl_xattr_header)) + return; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return; + + count = posix_acl_xattr_count(size); + if (count < 0) + return; + if (count == 0) + return; + + for (end = entry + count; entry != end; entry++) { + switch(le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kuid(to, uid)); + break; + case ACL_GROUP: + gid = make_kgid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kgid(to, gid)); + break; + default: + break; + } + } +} + +void posix_acl_fix_xattr_from_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); +} + +void posix_acl_fix_xattr_to_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); +} + +/* + * Convert from extended attribute to in-memory representation. + */ +struct posix_acl * +posix_acl_from_xattr(struct user_namespace *user_ns, + const void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + struct posix_acl *acl; + struct posix_acl_entry *acl_e; + + if (!value) + return NULL; + if (size < sizeof(posix_acl_xattr_header)) + return ERR_PTR(-EINVAL); + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return ERR_PTR(-EOPNOTSUPP); + + count = posix_acl_xattr_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + acl_e = acl->a_entries; + + for (end = entry + count; entry != end; acl_e++, entry++) { + acl_e->e_tag = le16_to_cpu(entry->e_tag); + acl_e->e_perm = le16_to_cpu(entry->e_perm); + + switch(acl_e->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + break; + + case ACL_USER: + acl_e->e_uid = + make_kuid(user_ns, + le32_to_cpu(entry->e_id)); + if (!uid_valid(acl_e->e_uid)) + goto fail; + break; + case ACL_GROUP: + acl_e->e_gid = + make_kgid(user_ns, + le32_to_cpu(entry->e_id)); + if (!gid_valid(acl_e->e_gid)) + goto fail; + break; + + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL (posix_acl_from_xattr); + +/* + * Convert from in-memory to extended attribute representation. + */ +int +posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + void *buffer, size_t size) +{ + posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; + posix_acl_xattr_entry *ext_entry; + int real_size, n; + + real_size = posix_acl_xattr_size(acl->a_count); + if (!buffer) + return real_size; + if (real_size > size) + return -ERANGE; + + ext_entry = ext_acl->a_entries; + ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + + for (n=0; n < acl->a_count; n++, ext_entry++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); + ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + ext_entry->e_id = + cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); + break; + case ACL_GROUP: + ext_entry->e_id = + cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); + break; + default: + ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } + } + return real_size; +} +EXPORT_SYMBOL (posix_acl_to_xattr); + +static int +posix_acl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (S_ISLNK(dentry->d_inode->i_mode)) + return -EOPNOTSUPP; + + acl = get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); + posix_acl_release(acl); + + return error; +} + +static int +posix_acl_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + int ret; + + if (!IS_POSIXACL(inode)) + return -EOPNOTSUPP; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return value ? -EACCES : 0; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto out; + } + } + + ret = inode->i_op->set_acl(inode, acl, type); +out: + posix_acl_release(acl); + return ret; +} + +static size_t +posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const char *xname; + size_t size; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (S_ISLNK(dentry->d_inode->i_mode)) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_ACCESS) + xname = POSIX_ACL_XATTR_ACCESS; + else + xname = POSIX_ACL_XATTR_DEFAULT; + + size = strlen(xname) + 1; + if (list && size <= list_size) + memcpy(list, xname, size); + return size; +} + +const struct xattr_handler posix_acl_access_xattr_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); + +const struct xattr_handler posix_acl_default_xattr_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); + +int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + + if (type == ACL_TYPE_ACCESS) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return 0; + if (error == 0) + acl = NULL; + } + + inode->i_ctime = CURRENT_TIME; + set_cached_acl(inode, type, acl); + return 0; +} + +int simple_acl_create(struct inode *dir, struct inode *inode) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); + return 0; +} diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 15af6222f8a..2183fcf41d5 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -31,6 +31,10 @@ config PROC_FS config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU + help + Provides a virtual ELF core file of the live kernel. This can + be read with gdb and other ELF tools. No modifications can be + made using this mechanism. config PROC_VMCORE bool "/proc/vmcore support" diff --git a/fs/proc/Makefile b/fs/proc/Makefile index ab30716584f..239493ec718 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -27,6 +27,5 @@ proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o proc-$(CONFIG_PROC_VMCORE) += vmcore.o -proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o diff --git a/fs/proc/array.c b/fs/proc/array.c index cbd0f1b324b..64db2bceac5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -138,26 +138,17 @@ static const char * const task_state_array[] = { "D (disk sleep)", /* 2 */ "T (stopped)", /* 4 */ "t (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)", /* 32 */ - "x (dead)", /* 64 */ - "K (wakekill)", /* 128 */ - "W (waking)", /* 256 */ - "P (parked)", /* 512 */ + "X (dead)", /* 16 */ + "Z (zombie)", /* 32 */ }; static inline const char *get_task_state(struct task_struct *tsk) { - unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; - const char * const *p = &task_state_array[0]; + unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; - BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); - while (state) { - p++; - state >>= 1; - } - return *p; + return task_state_array[fls(state)]; } static inline void task_state(struct seq_file *m, struct pid_namespace *ns, @@ -183,6 +174,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" + "Ngid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" "TracerPid:\t%d\n" @@ -190,6 +182,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), task_tgid_nr_ns(p, ns), + task_numa_group_id(p), pid_nr_ns(pid, ns), ppid, tpid, from_kuid_munged(user_ns, cred->uid), @@ -451,8 +444,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += t->min_flt; maj_flt += t->maj_flt; gtime += task_gtime(t); - t = next_thread(t); - } while (t != task); + } while_each_thread(task, t); min_flt += sig->min_flt; maj_flt += sig->maj_flt; diff --git a/fs/proc/base.c b/fs/proc/base.c index 1485e38daaa..2d696b0c93b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -200,41 +200,9 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static int proc_pid_cmdline(struct task_struct *task, char * buffer) +static int proc_pid_cmdline(struct task_struct *task, char *buffer) { - int res = 0; - unsigned int len; - struct mm_struct *mm = get_task_mm(task); - if (!mm) - goto out; - if (!mm->arg_end) - goto out_mm; /* Shh! No looking before we're done */ - - len = mm->arg_end - mm->arg_start; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - - res = access_process_vm(task, mm->arg_start, buffer, len, 0); - - // If the nul at the end of args has been overwritten, then - // assume application is using setproctitle(3). - if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) { - len = strnlen(buffer, res); - if (len < res) { - res = len; - } else { - len = mm->env_end - mm->env_start; - if (len > PAGE_SIZE - res) - len = PAGE_SIZE - res; - res += access_process_vm(task, mm->env_start, buffer+res, len, 0); - res = strnlen(buffer, res); - } - } -out_mm: - mmput(mm); -out: - return res; + return get_cmdline(task, buffer, PAGE_SIZE); } static int proc_pid_auxv(struct task_struct *task, char *buffer) @@ -1151,10 +1119,16 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - kloginuid = make_kuid(file->f_cred->user_ns, loginuid); - if (!uid_valid(kloginuid)) { - length = -EINVAL; - goto out_free_page; + + /* is userspace tring to explicitly UNSET the loginuid? */ + if (loginuid == AUDIT_UID_UNSET) { + kloginuid = INVALID_UID; + } else { + kloginuid = make_kuid(file->f_cred->user_ns, loginuid); + if (!uid_valid(kloginuid)) { + length = -EINVAL; + goto out_free_page; + } } length = audit_set_loginuid(kloginuid); @@ -1230,6 +1204,9 @@ static ssize_t proc_fault_inject_write(struct file * file, make_it_fail = simple_strtol(strstrip(buffer), &end, 0); if (*end) return -EINVAL; + if (make_it_fail < 0 || make_it_fail > 1) + return -EINVAL; + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -1652,13 +1629,18 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) return 0; } +static inline bool proc_inode_is_dead(struct inode *inode) +{ + return !proc_pid(inode)->tasks[PIDTYPE_PID].first; +} + int pid_delete_dentry(const struct dentry *dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; + return proc_inode_is_dead(dentry->d_inode); } const struct dentry_operations pid_dentry_operations = @@ -1813,6 +1795,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path) if (rc) goto out_mmput; + rc = -ENOENT; down_read(&mm->mmap_sem); vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { @@ -2576,7 +2559,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), - ONE("personality", S_IRUGO, proc_pid_personality), + ONE("personality", S_IRUSR, proc_pid_personality), INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), @@ -2586,7 +2569,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUGO, proc_pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tgid_stat), @@ -2605,7 +2588,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), - REG("pagemap", S_IRUGO, proc_pagemap_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -2614,7 +2597,7 @@ static const struct pid_entry tgid_base_stuff[] = { INF("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE - ONE("stack", S_IRUGO, proc_pid_stack), + ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, proc_pid_schedstat), @@ -2915,14 +2898,14 @@ static const struct pid_entry tid_base_stuff[] = { REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), - ONE("personality", S_IRUGO, proc_pid_personality), + ONE("personality", S_IRUSR, proc_pid_personality), INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUGO, proc_pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tid_stat), @@ -2943,7 +2926,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_tid_smaps_operations), - REG("pagemap", S_IRUGO, proc_pagemap_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -2952,7 +2935,7 @@ static const struct pid_entry tid_base_stuff[] = { INF("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE - ONE("stack", S_IRUGO, proc_pid_stack), + ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, proc_pid_schedstat), @@ -3086,34 +3069,42 @@ out_no_task: * In the case of a seek we start with the leader and walk nr * threads past it. */ -static struct task_struct *first_tid(struct task_struct *leader, - int tid, int nr, struct pid_namespace *ns) +static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, + struct pid_namespace *ns) { - struct task_struct *pos; + struct task_struct *pos, *task; + unsigned long nr = f_pos; + + if (nr != f_pos) /* 32bit overflow? */ + return NULL; rcu_read_lock(); - /* Attempt to start with the pid of a thread */ - if (tid && (nr > 0)) { + task = pid_task(pid, PIDTYPE_PID); + if (!task) + goto fail; + + /* Attempt to start with the tid of a thread */ + if (tid && nr) { pos = find_task_by_pid_ns(tid, ns); - if (pos && (pos->group_leader == leader)) + if (pos && same_thread_group(pos, task)) goto found; } /* If nr exceeds the number of threads there is nothing todo */ - pos = NULL; - if (nr && nr >= get_nr_threads(leader)) - goto out; + if (nr >= get_nr_threads(task)) + goto fail; /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ - for (pos = leader; nr > 0; --nr) { - pos = next_thread(pos); - if (pos == leader) { - pos = NULL; - goto out; - } - } + pos = task = task->group_leader; + do { + if (!nr--) + goto found; + } while_each_thread(task, pos); +fail: + pos = NULL; + goto out; found: get_task_struct(pos); out: @@ -3146,25 +3137,16 @@ static struct task_struct *next_tid(struct task_struct *start) /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file *file, struct dir_context *ctx) { - struct task_struct *leader = NULL; - struct task_struct *task = get_proc_task(file_inode(file)); + struct inode *inode = file_inode(file); + struct task_struct *task; struct pid_namespace *ns; int tid; - if (!task) - return -ENOENT; - rcu_read_lock(); - if (pid_alive(task)) { - leader = task->group_leader; - get_task_struct(leader); - } - rcu_read_unlock(); - put_task_struct(task); - if (!leader) + if (proc_inode_is_dead(inode)) return -ENOENT; if (!dir_emit_dots(file, ctx)) - goto out; + return 0; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. @@ -3172,7 +3154,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) ns = file->f_dentry->d_sb->s_fs_info; tid = (int)file->f_version; file->f_version = 0; - for (task = first_tid(leader, tid, ctx->pos - 2, ns); + for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { char name[PROC_NUMBUF]; @@ -3188,8 +3170,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) break; } } -out: - put_task_struct(leader); + return 0; } diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 82676e3fcd1..cbd82dff7e8 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -26,4 +26,4 @@ static int __init proc_cmdline_init(void) proc_create("cmdline", 0, NULL, &cmdline_proc_fops); return 0; } -module_init(proc_cmdline_init); +fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index b701eaa482b..290ba85cb90 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -29,7 +29,6 @@ static int show_console_dev(struct seq_file *m, void *v) char flags[ARRAY_SIZE(con_flags) + 1]; struct console *con = v; unsigned int a; - int len; dev_t dev = 0; if (con->device) { @@ -47,11 +46,10 @@ static int show_console_dev(struct seq_file *m, void *v) con_flags[a].name : ' '; flags[a] = 0; - seq_printf(m, "%s%d%n", con->name, con->index, &len); - len = 21 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-', + seq_setwidth(m, 21 - 1); + seq_printf(m, "%s%d", con->name, con->index); + seq_pad(m, ' '); + seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', con->write ? 'W' : '-', con->unblank ? 'U' : '-', flags); if (dev) @@ -111,4 +109,4 @@ static int __init proc_consoles_init(void) proc_create("consoles", 0, NULL, &proc_consoles_operations); return 0; } -module_init(proc_consoles_init); +fs_initcall(proc_consoles_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 5a1e539a234..06f4d31e039 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -21,4 +21,4 @@ static int __init proc_cpuinfo_init(void) proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); return 0; } -module_init(proc_cpuinfo_init); +fs_initcall(proc_cpuinfo_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index b14347167c3..50493edc30e 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -67,4 +67,4 @@ static int __init proc_devices_init(void) proc_create("devices", 0, NULL, &proc_devinfo_operations); return 0; } -module_init(proc_devices_init); +fs_initcall(proc_devices_init); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 985ea881b5b..0788d093f5d 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> +#include "../mount.h" #include "internal.h" #include "fd.h" @@ -48,8 +49,9 @@ static int seq_show(struct seq_file *m, void *v) } if (!ret) { - seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", - (long long)file->f_pos, f_flags); + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", + (long long)file->f_pos, f_flags, + real_mount(file->f_path.mnt)->mnt_id); if (file->f_op->show_fdinfo) ret = file->f_op->show_fdinfo(m, file); fput(file); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 737e15615b0..b7f268eb5f4 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -49,8 +49,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) setattr_copy(inode, iattr); mark_inode_dirty(inode); - de->uid = inode->i_uid; - de->gid = inode->i_gid; + proc_set_user(de, inode->i_uid, inode->i_gid); de->mode = inode->i_mode; return 0; } @@ -175,22 +174,6 @@ static const struct inode_operations proc_link_inode_operations = { }; /* - * As some entries in /proc are volatile, we want to - * get rid of unused dentries. This could be made - * smarter: we could keep a "volatile" flag in the - * inode to indicate which ones to keep. - */ -static int proc_delete_dentry(const struct dentry * dentry) -{ - return 1; -} - -static const struct dentry_operations proc_dentry_operations = -{ - .d_delete = proc_delete_dentry, -}; - -/* * Don't create negative dentries here, return -ENOENT by hand * instead. */ @@ -209,7 +192,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &proc_dentry_operations); + d_set_d_op(dentry, &simple_dentry_operations); d_add(dentry, inode); return NULL; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 9f8ef9b7674..0adbc02d60e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -35,7 +35,7 @@ static void proc_evict_inode(struct inode *inode) const struct proc_ns_operations *ns_ops; void *ns; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ @@ -47,7 +47,7 @@ static void proc_evict_inode(struct inode *inode) pde_put(de); head = PROC_I(inode)->sysctl; if (head) { - rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); + RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); sysctl_head_put(head); } /* Release any associated namespace */ @@ -285,15 +285,27 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) return rv; } -static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) +static unsigned long +proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct proc_dir_entry *pde = PDE(file_inode(file)); - int rv = -EIO; - unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + unsigned long rv = -EIO; + if (use_pde(pde)) { - get_unmapped_area = pde->proc_fops->get_unmapped_area; - if (get_unmapped_area) - rv = get_unmapped_area(file, orig_addr, len, pgoff, flags); + typeof(proc_reg_get_unmapped_area) *get_area; + + get_area = pde->proc_fops->get_unmapped_area; +#ifdef CONFIG_MMU + if (!get_area) + get_area = current->mm->get_unmapped_area; +#endif + + if (get_area) + rv = get_area(file, orig_addr, len, pgoff, flags); + else + rv = orig_addr; unuse_pde(pde); } return rv; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 651d09a11dd..3ab6d14e71c 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -211,13 +211,6 @@ extern int proc_fill_super(struct super_block *); extern void proc_entry_rundown(struct proc_dir_entry *); /* - * proc_devtree.c - */ -#ifdef CONFIG_PROC_DEVICETREE -extern void proc_device_tree_init(void); -#endif - -/* * proc_namespaces.c */ extern const struct inode_operations proc_ns_dir_inode_operations; diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index 05029c0e2f2..a352d5703b4 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -50,4 +50,4 @@ static int __init proc_interrupts_init(void) proc_create("interrupts", 0, NULL, &proc_interrupts_operations); return 0; } -module_init(proc_interrupts_init); +fs_initcall(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 06ea155e1a5..39e6ef32f0b 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -255,8 +255,7 @@ static int kcore_update_ram(void) end_pfn = 0; for_each_node_state(nid, N_MEMORY) { unsigned long node_end; - node_end = NODE_DATA(nid)->node_start_pfn + - NODE_DATA(nid)->node_spanned_pages; + node_end = node_end_pfn(nid); if (end_pfn < node_end) end_pfn = node_end; } @@ -640,4 +639,4 @@ static int __init proc_kcore_init(void) return 0; } -module_init(proc_kcore_init); +fs_initcall(proc_kcore_init); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bdfabdaefdc..05f8dcdb086 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -61,4 +61,4 @@ static int __init proc_kmsg_init(void) proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); return 0; } -module_init(proc_kmsg_init); +fs_initcall(proc_kmsg_init); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 1afa4dd4cae..aec66e6c206 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -42,4 +42,4 @@ static int __init proc_loadavg_init(void) proc_create("loadavg", 0, NULL, &loadavg_proc_fops); return 0; } -module_init(proc_loadavg_init); +fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 59d85d60889..7445af0b1aa 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -1,8 +1,8 @@ #include <linux/fs.h> -#include <linux/hugetlb.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/mmzone.h> #include <linux/proc_fs.h> @@ -24,10 +24,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; unsigned long committed; - unsigned long allowed; struct vmalloc_info vmi; long cached; + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; int lru; /* @@ -37,8 +40,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) si_meminfo(&i); si_swapinfo(&i); committed = percpu_counter_read_positive(&vm_committed_as); - allowed = ((totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100) + total_swap_pages; cached = global_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; @@ -50,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_page_state(NR_LRU_BASE + lru); + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + * + * Free memory cannot be taken below the low watermark, before the + * system starts swapping. + */ + available = i.freeram - wmark_low; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + /* * Tagged format, for easy grepping and expansion. */ seq_printf(m, "MemTotal: %8lu kB\n" "MemFree: %8lu kB\n" + "MemAvailable: %8lu kB\n" "Buffers: %8lu kB\n" "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" @@ -108,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) , K(i.totalram), K(i.freeram), + K(available), K(i.bufferram), K(cached), K(total_swapcache_pages()), @@ -147,7 +181,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), K(global_page_state(NR_WRITEBACK_TEMP)), - K(allowed), + K(vm_commit_limit()), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, vmi.used >> 10, @@ -186,4 +220,4 @@ static int __init proc_meminfo_init(void) proc_create("meminfo", 0, NULL, &meminfo_proc_fops); return 0; } -module_init(proc_meminfo_init); +fs_initcall(proc_meminfo_init); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 49a7fff2e83..89026095f2b 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -42,12 +42,6 @@ static const struct inode_operations ns_inode_operations = { .setattr = proc_setattr, }; -static int ns_delete_dentry(const struct dentry *dentry) -{ - /* Don't cache namespace inodes when not in use */ - return 1; -} - static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = dentry->d_inode; @@ -59,7 +53,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) const struct dentry_operations ns_dentry_operations = { - .d_delete = ns_delete_dentry, + .d_delete = always_delete_dentry, .d_dname = ns_dname, }; @@ -152,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl struct task_struct *task; void *ns; char name[50]; - int len = -EACCES; + int res = -EACCES; task = get_proc_task(inode); if (!task) @@ -161,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - len = -ENOENT; + res = -ENOENT; ns = ns_ops->get(task); if (!ns) goto out_put_task; snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); - len = strlen(name); - - if (len > buflen) - len = buflen; - if (copy_to_user(buffer, name, len)) - len = -EFAULT; - + res = readlink_copy(buffer, buflen, name); ns_ops->put(ns); out_put_task: put_task_struct(task); out: - return len; + return res; } static const struct inode_operations proc_ns_link_inode_operations = { diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index ccfd99bd1c5..d4a35746cab 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -39,7 +39,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) unsigned long ino = 0; struct file *file; dev_t dev = 0; - int flags, len; + int flags; flags = region->vm_flags; file = region->vm_file; @@ -50,8 +50,9 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) ino = inode->i_ino; } + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_printf(m, - "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", region->vm_start, region->vm_end, flags & VM_READ ? 'r' : '-', @@ -59,13 +60,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', ((loff_t)region->vm_pgoff) << PAGE_SHIFT, - MAJOR(dev), MINOR(dev), ino, &len); + MAJOR(dev), MINOR(dev), ino); if (file) { - len = 25 + sizeof(void *) * 6 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); + seq_pad(m, ' '); seq_path(m, &file->f_path, ""); } @@ -133,4 +131,4 @@ static int __init proc_nommu_init(void) return 0; } -module_init(proc_nommu_init); +fs_initcall(proc_nommu_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index b8730d9ebae..e647c55275d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -118,10 +118,11 @@ u64 stable_page_flags(struct page *page) /* * PageTransCompound can be true for non-huge compound pages (slab * pages or pages allocated by drivers with __GFP_COMP) because it - * just checks PG_head/PG_tail, so we need to check PageLRU to make - * sure a given page is a thp, not a non-huge compound page. + * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon + * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page) && PageLRU(compound_trans_head(page))) + else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || + PageAnon(compound_head(page)))) u |= 1 << KPF_THP; /* @@ -217,4 +218,4 @@ static int __init proc_page_init(void) proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); return 0; } -module_init(proc_page_init); +fs_initcall(proc_page_init); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c deleted file mode 100644 index 106a8357063..00000000000 --- a/fs/proc/proc_devtree.c +++ /dev/null @@ -1,243 +0,0 @@ -/* - * proc_devtree.c - handles /proc/device-tree - * - * Copyright 1997 Paul Mackerras - */ -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/time.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/printk.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/of.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <asm/prom.h> -#include <asm/uaccess.h> -#include "internal.h" - -static inline void set_node_proc_entry(struct device_node *np, - struct proc_dir_entry *de) -{ -#ifdef HAVE_ARCH_DEVTREE_FIXUPS - np->pde = de; -#endif -} - -static struct proc_dir_entry *proc_device_tree; - -/* - * Supply data on a read from /proc/device-tree/node/property. - */ -static int property_proc_show(struct seq_file *m, void *v) -{ - struct property *pp = m->private; - - seq_write(m, pp->value, pp->length); - return 0; -} - -static int property_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, property_proc_show, __PDE_DATA(inode)); -} - -static const struct file_operations property_proc_fops = { - .owner = THIS_MODULE, - .open = property_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* - * For a node with a name like "gc@10", we make symlinks called "gc" - * and "@10" to it. - */ - -/* - * Add a property to a node - */ -static struct proc_dir_entry * -__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, - const char *name) -{ - struct proc_dir_entry *ent; - - /* - * Unfortunately proc_register puts each new entry - * at the beginning of the list. So we rearrange them. - */ - ent = proc_create_data(name, - strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR, - de, &property_proc_fops, pp); - if (ent == NULL) - return NULL; - - if (!strncmp(name, "security-", 9)) - ent->size = 0; /* don't leak number of password chars */ - else - ent->size = pp->length; - - return ent; -} - - -void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop) -{ - __proc_device_tree_add_prop(pde, prop, prop->name); -} - -void proc_device_tree_remove_prop(struct proc_dir_entry *pde, - struct property *prop) -{ - remove_proc_entry(prop->name, pde); -} - -void proc_device_tree_update_prop(struct proc_dir_entry *pde, - struct property *newprop, - struct property *oldprop) -{ - struct proc_dir_entry *ent; - - if (!oldprop) { - proc_device_tree_add_prop(pde, newprop); - return; - } - - for (ent = pde->subdir; ent != NULL; ent = ent->next) - if (ent->data == oldprop) - break; - if (ent == NULL) { - pr_warn("device-tree: property \"%s\" does not exist\n", - oldprop->name); - } else { - ent->data = newprop; - ent->size = newprop->length; - } -} - -/* - * Various dodgy firmware might give us nodes and/or properties with - * conflicting names. That's generally ok, except for exporting via /proc, - * so munge names here to ensure they're unique. - */ - -static int duplicate_name(struct proc_dir_entry *de, const char *name) -{ - struct proc_dir_entry *ent; - int found = 0; - - spin_lock(&proc_subdir_lock); - - for (ent = de->subdir; ent != NULL; ent = ent->next) { - if (strcmp(ent->name, name) == 0) { - found = 1; - break; - } - } - - spin_unlock(&proc_subdir_lock); - - return found; -} - -static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de, - const char *name) -{ - char *fixed_name; - int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */ - int i = 1, size; - -realloc: - fixed_name = kmalloc(fixup_len, GFP_KERNEL); - if (fixed_name == NULL) { - pr_err("device-tree: Out of memory trying to fixup " - "name \"%s\"\n", name); - return name; - } - -retry: - size = snprintf(fixed_name, fixup_len, "%s#%d", name, i); - size++; /* account for NULL */ - - if (size > fixup_len) { - /* We ran out of space, free and reallocate. */ - kfree(fixed_name); - fixup_len = size; - goto realloc; - } - - if (duplicate_name(de, fixed_name)) { - /* Multiple duplicates. Retry with a different offset. */ - i++; - goto retry; - } - - pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n", - np->full_name, fixed_name); - - return fixed_name; -} - -/* - * Process a node, adding entries for its children and its properties. - */ -void proc_device_tree_add_node(struct device_node *np, - struct proc_dir_entry *de) -{ - struct property *pp; - struct proc_dir_entry *ent; - struct device_node *child; - const char *p; - - set_node_proc_entry(np, de); - for (child = NULL; (child = of_get_next_child(np, child));) { - /* Use everything after the last slash, or the full name */ - p = kbasename(child->full_name); - - if (duplicate_name(de, p)) - p = fixup_name(np, de, p); - - ent = proc_mkdir(p, de); - if (ent == NULL) - break; - proc_device_tree_add_node(child, ent); - } - of_node_put(child); - - for (pp = np->properties; pp != NULL; pp = pp->next) { - p = pp->name; - - if (strchr(p, '/')) - continue; - - if (duplicate_name(de, p)) - p = fixup_name(np, de, p); - - ent = __proc_device_tree_add_prop(de, pp, p); - if (ent == NULL) - break; - } -} - -/* - * Called on initialization to set up the /proc/device-tree subtree - */ -void __init proc_device_tree_init(void) -{ - struct device_node *root; - - proc_device_tree = proc_mkdir("device-tree", NULL); - if (proc_device_tree == NULL) - return; - root = of_find_node_by_path("/"); - if (root == NULL) { - pr_debug("/proc/device-tree: can't find root\n"); - return; - } - proc_device_tree_add_node(root, proc_device_tree); - of_node_put(root); -} diff --git a/fs/proc/root.c b/fs/proc/root.c index 87dbcbef7fe..5dbadecb234 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -92,6 +92,8 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) int proc_remount(struct super_block *sb, int *flags, char *data) { struct pid_namespace *pid = sb->s_fs_info; + + sync_filesystem(sb); return !proc_parse_options(data, pid); } @@ -183,9 +185,6 @@ void __init proc_root_init(void) proc_mkdir("openprom", NULL); #endif proc_tty_init(); -#ifdef CONFIG_PROC_DEVICETREE - proc_device_tree_init(); -#endif proc_mkdir("bus", NULL); proc_sys_init(); } diff --git a/fs/proc/self.c b/fs/proc/self.c index ffeb202ec94..4348bb8907c 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, if (!tgid) return -ENOENT; sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); + return readlink_copy(buffer, buflen, tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 62604be9f58..ad8a77f94be 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -41,4 +41,4 @@ static int __init proc_softirqs_init(void) proc_create("softirqs", 0, NULL, &proc_softirqs_operations); return 0; } -module_init(proc_softirqs_init); +fs_initcall(proc_softirqs_init); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 1cf86c0e868..bf2d03f8fd3 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -9,7 +9,7 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/irqnr.h> -#include <asm/cputime.h> +#include <linux/cputime.h> #include <linux/tick.h> #ifndef arch_irq_stat_cpu @@ -184,29 +184,11 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_possible_cpus(); - char *buf; - struct seq_file *m; - int res; + size_t size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; - - /* don't ask for more than the kmalloc() max size */ - if (size > KMALLOC_MAX_SIZE) - size = KMALLOC_MAX_SIZE; - buf = kmalloc(size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = single_open(file, show_stat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = ksize(buf); - } else - kfree(buf); - return res; + return single_open_size(file, show_stat, NULL, size); } static const struct file_operations proc_stat_operations = { @@ -221,4 +203,4 @@ static int __init proc_stat_init(void) proc_create("stat", 0, NULL, &proc_stat_operations); return 0; } -module_init(proc_stat_init); +fs_initcall(proc_stat_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7366e9d63ce..cfa63ee92c9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,4 +1,5 @@ #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> @@ -62,7 +63,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + (PTRS_PER_PTE * sizeof(pte_t) * + atomic_long_read(&mm->nr_ptes)) >> 10, swap << (PAGE_SHIFT-10)); } @@ -83,14 +85,6 @@ unsigned long task_statm(struct mm_struct *mm, return mm->total_vm; } -static void pad_len_spaces(struct seq_file *m, int len) -{ - len = 25 + sizeof(void*) * 6 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); -} - #ifdef CONFIG_NUMA /* * These functions are for numa_maps but called in generic **maps seq_file @@ -159,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) /* * We remember last_addr rather than next_addr to hit with - * mmap_cache most of the time. We have zero last_addr at + * vmacache most of the time. We have zero last_addr at * the beginning and also after lseek. We will have -1 last_addr * after the end of the vmas. */ @@ -268,7 +262,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; - int len; const char *name = NULL; if (file) { @@ -286,7 +279,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) if (stack_guard_page_end(vma, end)) end -= PAGE_SIZE; - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", start, end, flags & VM_READ ? 'r' : '-', @@ -294,18 +288,24 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? 's' : 'p', pgoff, - MAJOR(dev), MINOR(dev), ino, &len); + MAJOR(dev), MINOR(dev), ino); /* * Print the dentry name for named mappings, and a * special [heap] marker for the heap: */ if (file) { - pad_len_spaces(m, len); + seq_pad(m, ' '); seq_path(m, &file->f_path, "\n"); goto done; } + if (vma->vm_ops && vma->vm_ops->name) { + name = vma->vm_ops->name(vma); + if (name) + goto done; + } + name = arch_vma_name(vma); if (!name) { pid_t tid; @@ -333,7 +333,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) name = "[stack]"; } else { /* Thread stack in /proc/PID/maps */ - pad_len_spaces(m, len); + seq_pad(m, ' '); seq_printf(m, "[stack:%d]", tid); } } @@ -341,7 +341,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) done: if (name) { - pad_len_spaces(m, len); + seq_pad(m, ' '); seq_puts(m, name); } seq_putc(m, '\n'); @@ -505,9 +505,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); mss->anonymous_thp += HPAGE_PMD_SIZE; return 0; } @@ -561,6 +561,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_NONLINEAR)] = "nl", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_DONTDUMP)] = "dd", +#ifdef CONFIG_MEM_SOFT_DIRTY + [ilog2(VM_SOFTDIRTY)] = "sd", +#endif [ilog2(VM_MIXEDMAP)] = "mm", [ilog2(VM_HUGEPAGE)] = "hg", [ilog2(VM_NOHUGEPAGE)] = "nh", @@ -740,9 +743,6 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_file_clear_soft_dirty(ptent); } - if (vma->vm_flags & VM_SOFTDIRTY) - vma->vm_flags &= ~VM_SOFTDIRTY; - set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -810,8 +810,9 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, if (type == CLEAR_REFS_SOFT_DIRTY) { soft_dirty_cleared = true; - pr_warn_once("The pagemap bits 55-60 has changed their meaning! " - "See the linux/Documentation/vm/pagemap.txt for details.\n"); + pr_warn_once("The pagemap bits 55-60 has changed their meaning!" + " See the linux/Documentation/vm/pagemap.txt for " + "details.\n"); } task = get_proc_task(file_inode(file)); @@ -842,11 +843,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, * * Writing 3 to /proc/pid/clear_refs only affects file * mapped pages. + * + * Writing 4 to /proc/pid/clear_refs affects all pages. */ if (type == CLEAR_REFS_ANON && vma->vm_file) continue; if (type == CLEAR_REFS_MAPPED && !vma->vm_file) continue; + if (type == CLEAR_REFS_SOFT_DIRTY) { + if (vma->vm_flags & VM_SOFTDIRTY) + vma->vm_flags &= ~VM_SOFTDIRTY; + } walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } @@ -941,6 +948,8 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, frame = pte_pfn(pte); flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) @@ -960,7 +969,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, if (page && !PageAnon(page)) flags |= PM_FILE; - if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) + if ((vma->vm_flags & VM_SOFTDIRTY)) flags2 |= __PM_SOFT_DIRTY; *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); @@ -993,13 +1002,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, { struct vm_area_struct *vma; struct pagemapread *pm = walk->private; + spinlock_t *ptl; pte_t *pte; int err = 0; pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); - if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { int pmd_flags2; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) @@ -1017,7 +1027,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (err) break; } - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); return err; } @@ -1319,7 +1329,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, md = walk->private; - if (pmd_trans_huge_lock(pmd, md->vma) == 1) { + if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; @@ -1327,7 +1337,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (page) gather_stats(page, md, pte_dirty(huge_pte), HPAGE_PMD_SIZE/PAGE_SIZE); - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); return 0; } @@ -1351,7 +1361,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, struct numa_maps *md; struct page *page; - if (pte_none(*pte)) + if (!pte_present(*pte)) return 0; page = pte_page(*pte); @@ -1385,8 +1395,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct mm_struct *mm = vma->vm_mm; struct mm_walk walk = {}; struct mempolicy *pol; - int n; - char buffer[50]; + char buffer[64]; + int nid; if (!mm) return 0; @@ -1402,18 +1412,16 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.mm = mm; pol = get_vma_policy(task, vma, vma->vm_start); - n = mpol_to_str(buffer, sizeof(buffer), pol); + mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); - if (n < 0) - return n; seq_printf(m, "%08lx %s", vma->vm_start, buffer); if (file) { - seq_printf(m, " file="); + seq_puts(m, " file="); seq_path(m, &file->f_path, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { - seq_printf(m, " heap"); + seq_puts(m, " heap"); } else { pid_t tid = vm_is_stack(task, vma, is_pid); if (tid != 0) { @@ -1423,14 +1431,14 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) */ if (!is_pid || (vma->vm_start <= mm->start_stack && vma->vm_end >= mm->start_stack)) - seq_printf(m, " stack"); + seq_puts(m, " stack"); else seq_printf(m, " stack:%d", tid); } } if (is_vm_hugetlb_page(vma)) - seq_printf(m, " huge"); + seq_puts(m, " huge"); walk_page_range(vma->vm_start, vma->vm_end, &walk); @@ -1458,9 +1466,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (md->writeback) seq_printf(m, " writeback=%lu", md->writeback); - for_each_node_state(n, N_MEMORY) - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); + for_each_node_state(nid, N_MEMORY) + if (md->node[nid]) + seq_printf(m, " N%d=%lu", nid, md->node[nid]); out: seq_putc(m, '\n'); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 56123a6f462..678455d2d68 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -123,14 +123,6 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static void pad_len_spaces(struct seq_file *m, int len) -{ - len = 25 + sizeof(void*) * 6 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); -} - /* * display a single VMA to a sequenced file */ @@ -142,7 +134,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, unsigned long ino = 0; struct file *file; dev_t dev = 0; - int flags, len; + int flags; unsigned long long pgoff = 0; flags = vma->vm_flags; @@ -155,8 +147,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; } + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_printf(m, - "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', @@ -164,16 +157,16 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', pgoff, - MAJOR(dev), MINOR(dev), ino, &len); + MAJOR(dev), MINOR(dev), ino); if (file) { - pad_len_spaces(m, len); + seq_pad(m, ' '); seq_path(m, &file->f_path, ""); } else if (mm) { pid_t tid = vm_is_stack(priv->task, vma, is_pid); if (tid != 0) { - pad_len_spaces(m, len); + seq_pad(m, ' '); /* * Thread stack in /proc/PID/task/TID/maps or * the main process stack. diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 06189462590..33de567c25a 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -5,7 +5,7 @@ #include <linux/seq_file.h> #include <linux/time.h> #include <linux/kernel_stat.h> -#include <asm/cputime.h> +#include <linux/cputime.h> static int uptime_proc_show(struct seq_file *m, void *v) { @@ -49,4 +49,4 @@ static int __init proc_uptime_init(void) proc_create("uptime", 0, NULL, &uptime_proc_fops); return 0; } -module_init(proc_uptime_init); +fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index 76817a60678..d2154eb6d78 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -31,4 +31,4 @@ static int __init proc_version_init(void) proc_create("version", 0, NULL, &version_proc_fops); return 0; } -module_init(proc_version_init); +fs_initcall(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9100d695988..382aa890e22 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -42,7 +42,7 @@ static size_t elfnotes_sz; /* Total size of vmcore file. */ static u64 vmcore_size; -static struct proc_dir_entry *proc_vmcore = NULL; +static struct proc_dir_entry *proc_vmcore; /* * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error @@ -468,17 +468,23 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) return rc; } nhdr_ptr = notes_section; - while (real_sz < max_sz) { - if (nhdr_ptr->n_namesz == 0) - break; + while (nhdr_ptr->n_namesz != 0) { sz = sizeof(Elf64_Nhdr) + ((nhdr_ptr->n_namesz + 3) & ~3) + ((nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } real_sz += sz; nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); } kfree(notes_section); phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + } } return 0; @@ -648,17 +654,23 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) return rc; } nhdr_ptr = notes_section; - while (real_sz < max_sz) { - if (nhdr_ptr->n_namesz == 0) - break; + while (nhdr_ptr->n_namesz != 0) { sz = sizeof(Elf32_Nhdr) + ((nhdr_ptr->n_namesz + 3) & ~3) + ((nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } real_sz += sz; nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); } kfree(notes_section); phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + } } return 0; @@ -1082,7 +1094,7 @@ static int __init vmcore_init(void) proc_vmcore->size = vmcore_size; return 0; } -module_init(vmcore_init) +fs_initcall(vmcore_init); /* Cleanup function for vmcore module. */ void vmcore_cleanup(void) @@ -1104,4 +1116,3 @@ void vmcore_cleanup(void) } free_elfcorebuf(); } -EXPORT_SYMBOL_GPL(vmcore_cleanup); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 5fe34c355e8..1a81373947f 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -20,15 +20,15 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) struct proc_mounts *p = proc_mounts(file->private_data); struct mnt_namespace *ns = p->ns; unsigned res = POLLIN | POLLRDNORM; + int event; poll_wait(file, &p->ns->poll, wait); - br_read_lock(&vfsmount_lock); - if (p->m.poll_event != ns->event) { - p->m.poll_event = ns->event; + event = ACCESS_ONCE(ns->event); + if (p->m.poll_event != event) { + p->m.poll_event = event; res |= POLLERR | POLLPRI; } - br_read_unlock(&vfsmount_lock); return res; } @@ -234,17 +234,12 @@ static int mounts_open_common(struct inode *inode, struct file *file, rcu_read_lock(); nsp = task_nsproxy(task); - if (!nsp) { + if (!nsp || !nsp->mnt_ns) { rcu_read_unlock(); put_task_struct(task); goto err; } ns = nsp->mnt_ns; - if (!ns) { - rcu_read_unlock(); - put_task_struct(task); - goto err; - } get_mnt_ns(ns); rcu_read_unlock(); task_lock(task); @@ -272,6 +267,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->root = root; p->m.poll_event = ns->event; p->show = show; + p->cached_event = ~0ULL; return 0; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 12823845d32..192297b0090 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -249,6 +249,7 @@ static void parse_options(char *options) static int pstore_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); parse_options(data); return 0; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b8e93a40a5d..0a9b72cdfec 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "pstore: " fmt + #include <linux/atomic.h> #include <linux/types.h> #include <linux/errno.h> @@ -224,14 +226,12 @@ static void allocate_buf_for_compression(void) zlib_inflate_workspacesize()); stream.workspace = kmalloc(size, GFP_KERNEL); if (!stream.workspace) { - pr_err("pstore: No memory for compression workspace; " - "skipping compression\n"); + pr_err("No memory for compression workspace; skipping compression\n"); kfree(big_oops_buf); big_oops_buf = NULL; } } else { - pr_err("No memory for uncompressed data; " - "skipping compression\n"); + pr_err("No memory for uncompressed data; skipping compression\n"); stream.workspace = NULL; } @@ -443,8 +443,11 @@ int pstore_register(struct pstore_info *psi) pstore_get_records(0); kmsg_dump_register(&pstore_dumper); - pstore_register_console(); - pstore_register_ftrace(); + + if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { + pstore_register_console(); + pstore_register_ftrace(); + } if (pstore_update_ms >= 0) { pstore_timer.expires = jiffies + @@ -452,8 +455,7 @@ int pstore_register(struct pstore_info *psi) add_timer(&pstore_timer); } - pr_info("pstore: Registered %s as persistent store backend\n", - psi->name); + pr_info("Registered %s as persistent store backend\n", psi->name); return 0; } @@ -494,12 +496,13 @@ void pstore_get_records(int quiet) big_oops_buf_sz); if (unzipped_len > 0) { + kfree(buf); buf = big_oops_buf; size = unzipped_len; compressed = false; } else { - pr_err("pstore: decompression failed;" - "returned %d\n", unzipped_len); + pr_err("decompression failed;returned %d\n", + unzipped_len); compressed = true; } } @@ -520,8 +523,8 @@ out: mutex_unlock(&psi->read_mutex); if (failed) - printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", - failed, psi->name); + pr_warn("failed to load %d record(s) from '%s'\n", + failed, psi->name); } static void pstore_dowork(struct work_struct *work) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index fa8cef2cca3..3b5744306ed 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -86,6 +86,7 @@ struct ramoops_context { struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; unsigned int dump_write_cnt; + /* _read_cnt need clear on ramoops_pstore_open */ unsigned int dump_read_cnt; unsigned int console_read_cnt; unsigned int ftrace_read_cnt; @@ -101,6 +102,7 @@ static int ramoops_pstore_open(struct pstore_info *psi) cxt->dump_read_cnt = 0; cxt->console_read_cnt = 0; + cxt->ftrace_read_cnt = 0; return 0; } @@ -117,13 +119,15 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, return NULL; prz = przs[i]; + if (!prz) + return NULL; - if (update) { - /* Update old/shadowed buffer. */ + /* Update old/shadowed buffer. */ + if (update) persistent_ram_save_old(prz); - if (!persistent_ram_old_size(prz)) - return NULL; - } + + if (!persistent_ram_old_size(prz)) + return NULL; *typep = type; *id = i; @@ -316,6 +320,7 @@ static void ramoops_free_przs(struct ramoops_context *cxt) { int i; + cxt->max_dump_cnt = 0; if (!cxt->przs) return; @@ -346,7 +351,7 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, GFP_KERNEL); if (!cxt->przs) { dev_err(dev, "failed to initialize a prz array for dumps\n"); - return -ENOMEM; + goto fail_prz; } for (i = 0; i < cxt->max_dump_cnt; i++) { @@ -428,7 +433,6 @@ static int ramoops_probe(struct platform_device *pdev) if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); - cxt->dump_read_cnt = 0; cxt->size = pdata->mem_size; cxt->phys_addr = pdata->mem_address; cxt->record_size = pdata->record_size; @@ -505,7 +509,6 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; - cxt->max_dump_cnt = 0; fail_cnt: kfree(cxt->fprz); fail_init_fprz: diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index de272d42676..34a1e5aa848 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -12,6 +12,8 @@ * */ +#define pr_fmt(fmt) "persistent_ram: " fmt + #include <linux/device.h> #include <linux/err.h> #include <linux/errno.h> @@ -54,7 +56,7 @@ static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a) do { old = atomic_read(&prz->buffer->start); new = old + a; - while (unlikely(new > prz->buffer_size)) + while (unlikely(new >= prz->buffer_size)) new -= prz->buffer_size; } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old); @@ -91,7 +93,7 @@ static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a) old = atomic_read(&prz->buffer->start); new = old + a; - while (unlikely(new > prz->buffer_size)) + while (unlikely(new >= prz->buffer_size)) new -= prz->buffer_size; atomic_set(&prz->buffer->start, new); @@ -205,12 +207,10 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) size = buffer->data + prz->buffer_size - block; numerr = persistent_ram_decode_rs8(prz, block, size, par); if (numerr > 0) { - pr_devel("persistent_ram: error in block %p, %d\n", - block, numerr); + pr_devel("error in block %p, %d\n", block, numerr); prz->corrected_bytes += numerr; } else if (numerr < 0) { - pr_devel("persistent_ram: uncorrectable error in block %p\n", - block); + pr_devel("uncorrectable error in block %p\n", block); prz->bad_blocks++; } block += prz->ecc_info.block_size; @@ -257,7 +257,7 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, prz->rs_decoder = init_rs(prz->ecc_info.symsize, prz->ecc_info.poly, 0, 1, prz->ecc_info.ecc_size); if (prz->rs_decoder == NULL) { - pr_info("persistent_ram: init_rs failed\n"); + pr_info("init_rs failed\n"); return -EINVAL; } @@ -267,10 +267,10 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, numerr = persistent_ram_decode_rs8(prz, buffer, sizeof(*buffer), prz->par_header); if (numerr > 0) { - pr_info("persistent_ram: error in header, %d\n", numerr); + pr_info("error in header, %d\n", numerr); prz->corrected_bytes += numerr; } else if (numerr < 0) { - pr_info("persistent_ram: uncorrectable error in header\n"); + pr_info("uncorrectable error in header\n"); prz->bad_blocks++; } @@ -317,7 +317,7 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz) prz->old_log = kmalloc(size, GFP_KERNEL); } if (!prz->old_log) { - pr_err("persistent_ram: failed to allocate buffer\n"); + pr_err("failed to allocate buffer\n"); return; } @@ -396,8 +396,8 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size) pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL); if (!pages) { - pr_err("%s: Failed to allocate array for %u pages\n", __func__, - page_count); + pr_err("%s: Failed to allocate array for %u pages\n", + __func__, page_count); return NULL; } @@ -462,19 +462,17 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, if (prz->buffer->sig == sig) { if (buffer_size(prz) > prz->buffer_size || buffer_start(prz) > buffer_size(prz)) - pr_info("persistent_ram: found existing invalid buffer," - " size %zu, start %zu\n", - buffer_size(prz), buffer_start(prz)); + pr_info("found existing invalid buffer, size %zu, start %zu\n", + buffer_size(prz), buffer_start(prz)); else { - pr_debug("persistent_ram: found existing buffer," - " size %zu, start %zu\n", - buffer_size(prz), buffer_start(prz)); + pr_debug("found existing buffer, size %zu, start %zu\n", + buffer_size(prz), buffer_start(prz)); persistent_ram_save_old(prz); return 0; } } else { - pr_debug("persistent_ram: no valid data in buffer" - " (sig = 0x%08x)\n", prz->buffer->sig); + pr_debug("no valid data in buffer (sig = 0x%08x)\n", + prz->buffer->sig); } prz->buffer->sig = sig; @@ -509,7 +507,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, prz = kzalloc(sizeof(struct persistent_ram_zone), GFP_KERNEL); if (!prz) { - pr_err("persistent_ram: failed to allocate persistent ram zone\n"); + pr_err("failed to allocate persistent ram zone\n"); goto err; } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2e8caa62da7..c4bcb778886 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -27,7 +27,6 @@ static const struct super_operations qnx4_sops; -static void qnx4_put_super(struct super_block *sb); static struct inode *qnx4_alloc_inode(struct super_block *sb); static void qnx4_destroy_inode(struct inode *inode); static int qnx4_remount(struct super_block *sb, int *flags, char *data); @@ -37,7 +36,6 @@ static const struct super_operations qnx4_sops = { .alloc_inode = qnx4_alloc_inode, .destroy_inode = qnx4_destroy_inode, - .put_super = qnx4_put_super, .statfs = qnx4_statfs, .remount_fs = qnx4_remount, }; @@ -46,6 +44,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data) { struct qnx4_sb_info *qs; + sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; *flags |= MS_RDONLY; @@ -148,18 +147,19 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) * it really _is_ a qnx4 filesystem, and to check the size * of the directory entry. */ -static const char *qnx4_checkroot(struct super_block *sb) +static const char *qnx4_checkroot(struct super_block *sb, + struct qnx4_super_block *s) { struct buffer_head *bh; struct qnx4_inode_entry *rootdir; int rd, rl; int i, j; - if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') + if (s->RootDir.di_fname[0] != '/' || s->RootDir.di_fname[1] != '\0') return "no qnx4 filesystem (no root dir)."; QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); - rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; - rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); + rd = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_blk) - 1; + rl = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_size); for (j = 0; j < rl; j++) { bh = sb_bread(sb, rd + j); /* root dir, first block */ if (bh == NULL) @@ -189,7 +189,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) struct inode *root; const char *errmsg; struct qnx4_sb_info *qs; - int ret = -EINVAL; qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); if (!qs) @@ -198,67 +197,50 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) sb_set_blocksize(s, QNX4_BLOCK_SIZE); + s->s_op = &qnx4_sops; + s->s_magic = QNX4_SUPER_MAGIC; + s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + /* Check the superblock signature. Since the qnx4 code is dangerous, we should leave as quickly as possible if we don't belong here... */ bh = sb_bread(s, 1); if (!bh) { printk(KERN_ERR "qnx4: unable to read the superblock\n"); - goto outnobh; + return -EINVAL; } - if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) { - if (!silent) - printk(KERN_ERR "qnx4: wrong fsid in superblock.\n"); - goto out; - } - s->s_op = &qnx4_sops; - s->s_magic = QNX4_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ - qnx4_sb(s)->sb_buf = bh; - qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data; - /* check before allocating dentries, inodes, .. */ - errmsg = qnx4_checkroot(s); + errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data); + brelse(bh); if (errmsg != NULL) { if (!silent) printk(KERN_ERR "qnx4: %s\n", errmsg); - goto out; + return -EINVAL; } /* does root not have inode number QNX4_ROOT_INO ?? */ root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK); if (IS_ERR(root)) { printk(KERN_ERR "qnx4: get inode failed\n"); - ret = PTR_ERR(root); - goto outb; + return PTR_ERR(root); } - ret = -ENOMEM; s->s_root = d_make_root(root); if (s->s_root == NULL) - goto outb; + return -ENOMEM; - brelse(bh); return 0; - - outb: - kfree(qs->BitMap); - out: - brelse(bh); - outnobh: - kfree(qs); - s->s_fs_info = NULL; - return ret; } -static void qnx4_put_super(struct super_block *sb) +static void qnx4_kill_sb(struct super_block *sb) { struct qnx4_sb_info *qs = qnx4_sb(sb); - kfree( qs->BitMap ); - kfree( qs ); - sb->s_fs_info = NULL; - return; + kill_block_super(sb); + if (qs) { + kfree(qs->BitMap); + kfree(qs); + } } static int qnx4_readpage(struct file *file, struct page *page) @@ -409,7 +391,7 @@ static struct file_system_type qnx4_fs_type = { .owner = THIS_MODULE, .name = "qnx4", .mount = qnx4_mount, - .kill_sb = kill_block_super, + .kill_sb = qnx4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("qnx4"); diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index d024505ba00..e62c8183777 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -60,10 +60,6 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir, struct buffer_head *bh; *res_dir = NULL; - if (!dir->i_sb) { - printk(KERN_WARNING "qnx4: no superblock on dir.\n"); - return NULL; - } bh = NULL; block = offset = blkofs = 0; while (blkofs * QNX4_BLOCK_SIZE + offset < dir->i_size) { diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h index 34e2d329c97..c9b1be2c164 100644 --- a/fs/qnx4/qnx4.h +++ b/fs/qnx4/qnx4.h @@ -10,8 +10,6 @@ #endif struct qnx4_sb_info { - struct buffer_head *sb_buf; /* superblock buffer */ - struct qnx4_super_block *sb; /* our superblock */ unsigned int Version; /* may be useful */ struct qnx4_inode_entry *BitMap; /* useful */ }; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 8d941edfefa..65cdaab3ed4 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -55,6 +55,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root) static int qnx6_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index 880fd988436..c51df1dd237 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -8,9 +8,10 @@ config QUOTA help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). Currently, it works for the - ext2, ext3, and reiserfs file system. ext3 also supports journalled - quotas for which you don't need to run quotacheck(8) after an unclean - shutdown. + ext2, ext3, ext4, jfs, ocfs2 and reiserfs file systems. + Note that gfs2 and xfs use their own quota system. + Ext3, ext4 and reiserfs also support journaled quotas for which + you don't need to run quotacheck(8) after an unclean shutdown. For further details, read the Quota mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>, or the documentation provided with the quota tools. Probably the quota support is only useful for diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 831d49a4111..7f30bdc57d1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -528,7 +528,7 @@ restart: if (atomic_read(&dquot->dq_count)) { DEFINE_WAIT(wait); - atomic_inc(&dquot->dq_count); + dqgrab(dquot); prepare_to_wait(&dquot->dq_wait_unused, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&dq_list_lock); @@ -581,9 +581,17 @@ int dquot_scan_active(struct super_block *sb, dqstats_inc(DQST_LOOKUPS); dqput(old_dquot); old_dquot = dquot; - ret = fn(dquot, priv); - if (ret < 0) - goto out; + /* + * ->release_dquot() can be racing with us. Our reference + * protects us from new calls to it so just wait for any + * outstanding call and recheck the DQ_ACTIVE_B after that. + */ + wait_on_dquot(dquot); + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + ret = fn(dquot, priv); + if (ret < 0) + goto out; + } spin_lock(&dq_list_lock); /* We are safe to continue now because our dquot could not * be moved out of the inuse list while we hold the reference */ @@ -624,7 +632,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type) /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ - atomic_inc(&dquot->dq_count); + dqgrab(dquot); spin_unlock(&dq_list_lock); dqstats_inc(DQST_LOOKUPS); err = sb->dq_op->write_dquot(dquot); @@ -694,6 +702,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct dquot *dquot; unsigned long freed = 0; + spin_lock(&dq_list_lock); head = free_dquots.prev; while (head != &free_dquots && sc->nr_to_scan) { dquot = list_entry(head, struct dquot, dq_free); @@ -705,6 +714,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) freed++; head = free_dquots.prev; } + spin_unlock(&dq_list_lock); return freed; } diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 16e8abb7709..72d29177998 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -9,13 +9,25 @@ #include <net/netlink.h> #include <net/genetlink.h> +static const struct genl_multicast_group quota_mcgrps[] = { + { .name = "events", }, +}; + /* Netlink family structure for quota */ static struct genl_family quota_genl_family = { - .id = GENL_ID_GENERATE, + /* + * Needed due to multicast group ID abuse - old code assumed + * the family ID was also a valid multicast group ID (which + * isn't true) and userspace might thus rely on it. Assign a + * static ID for this group to make dealing with that easier. + */ + .id = GENL_ID_VFS_DQUOT, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, .maxattr = QUOTA_NL_A_MAX, + .mcgrps = quota_mcgrps, + .n_mcgrps = ARRAY_SIZE(quota_mcgrps), }; /** @@ -78,7 +90,7 @@ void quota_send_warning(struct kqid qid, dev_t dev, goto attr_err_out; genlmsg_end(skb, msg_head); - genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); + genlmsg_multicast("a_genl_family, skb, 0, 0, GFP_NOFS); return; attr_err_out: printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index dea86e8967e..ff3f0b3cfdb 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -117,6 +117,7 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr) static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src) { + memset(dst, 0, sizeof(*dst)); dst->dqb_bhardlimit = src->d_blk_hardlimit; dst->dqb_bsoftlimit = src->d_blk_softlimit; dst->dqb_curspace = src->d_bcount; @@ -277,6 +278,17 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, return ret; } +static int quota_rmxquota(struct super_block *sb, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->rm_xquota) + return -ENOSYS; + return sb->s_qcop->rm_xquota(sb, flags); +} + /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void __user *addr, struct path *path) @@ -315,8 +327,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return sb->s_qcop->quota_sync(sb, type); case Q_XQUOTAON: case Q_XQUOTAOFF: - case Q_XQUOTARM: return quota_setxstate(sb, cmd, addr); + case Q_XQUOTARM: + return quota_rmxquota(sb, addr); case Q_XGETQSTAT: return quota_getxstate(sb, addr); case Q_XGETQSTATV: diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 4884ac5ae9b..4f56de822d2 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -30,22 +30,15 @@ #include "internal.h" -const struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, -}; - const struct file_operations ramfs_file_operations = { - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = noop_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 8d5b438cc18..dda012ad420 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -27,24 +27,23 @@ #include "internal.h" static int ramfs_nommu_setattr(struct dentry *, struct iattr *); - -const struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, -}; +static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags); +static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); const struct file_operations ramfs_file_operations = { .mmap = ramfs_nommu_mmap, .get_unmapped_area = ramfs_nommu_get_unmapped_area, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .fsync = noop_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; @@ -197,7 +196,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) * - the pages to be mapped must exist * - the pages be physically contiguous in sequence */ -unsigned long ramfs_nommu_get_unmapped_area(struct file *file, +static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { @@ -256,7 +255,7 @@ out: /* * set up a mapping for shared memory segments */ -int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) +static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { if (!(vma->vm_flags & VM_SHARED)) return -ENOSYS; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 39d14659a8d..d365b1c4eb3 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -43,6 +43,13 @@ static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; +static const struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, + .set_page_dirty = __set_page_dirty_no_writeback, +}; + static struct backing_dev_info ramfs_backing_dev_info = { .name = "ramfs", .ra_pages = 0, /* No readahead */ @@ -275,4 +282,4 @@ int __init init_ramfs_fs(void) return err; } -module_init(init_ramfs_fs) +fs_initcall(init_ramfs_fs); diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h index 6b330639b51..a9d8ae88fa1 100644 --- a/fs/ramfs/internal.h +++ b/fs/ramfs/internal.h @@ -10,5 +10,4 @@ */ -extern const struct address_space_operations ramfs_aops; extern const struct inode_operations ramfs_file_inode_operations; diff --git a/fs/read_write.c b/fs/read_write.c index e3cd280b158..009d8542a88 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -25,11 +25,12 @@ typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, unsigned long, loff_t); +typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *); const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, .splice_read = generic_file_splice_read, }; @@ -257,17 +258,29 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence) fn = no_llseek; if (file->f_mode & FMODE_LSEEK) { - if (file->f_op && file->f_op->llseek) + if (file->f_op->llseek) fn = file->f_op->llseek; } return fn(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); +static inline struct fd fdget_pos(int fd) +{ + return __to_fd(__fdget_pos(fd)); +} + +static inline void fdput_pos(struct fd f) +{ + if (f.flags & FDPUT_POS_UNLOCK) + mutex_unlock(&f.file->f_pos_lock); + fdput(f); +} + SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { off_t retval; - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -278,7 +291,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } - fdput(f); + fdput_pos(f); return retval; } @@ -295,7 +308,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned int, whence) { int retval; - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); loff_t offset; if (!f.file) @@ -315,7 +328,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, retval = 0; } out_putf: - fdput(f); + fdput_pos(f); return retval; } #endif @@ -378,13 +391,34 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp EXPORT_SYMBOL(do_sync_read); +ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = buf, .iov_len = len }; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = len; + iov_iter_init(&iter, READ, &iov, 1, len); + + ret = filp->f_op->read_iter(&kiocb, &iter); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +EXPORT_SYMBOL(new_sync_read); + ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; - if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) + if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) return -EFAULT; @@ -394,8 +428,10 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) count = ret; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); - else + else if (file->f_op->aio_read) ret = do_sync_read(file, buf, count, pos); + else + ret = new_sync_read(file, buf, count, pos); if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); @@ -427,13 +463,34 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof EXPORT_SYMBOL(do_sync_write); +ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = len; + iov_iter_init(&iter, WRITE, &iov, 1, len); + + ret = filp->f_op->write_iter(&kiocb, &iter); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +EXPORT_SYMBOL(new_sync_write); + ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) { mm_segment_t old_fs; const char __user *p; ssize_t ret; - if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; old_fs = get_fs(); @@ -443,8 +500,10 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t count = MAX_RW_COUNT; if (file->f_op->write) ret = file->f_op->write(file, p, count, pos); - else + else if (file->f_op->aio_write) ret = do_sync_write(file, p, count, pos); + else + ret = new_sync_write(file, p, count, pos); set_fs(old_fs); if (ret > 0) { fsnotify_modify(file); @@ -460,7 +519,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(VERIFY_READ, buf, count))) return -EFAULT; @@ -471,8 +530,10 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); - else + else if (file->f_op->aio_write) ret = do_sync_write(file, buf, count, pos); + else + ret = new_sync_write(file, buf, count, pos); if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); @@ -498,7 +559,7 @@ static inline void file_pos_write(struct file *file, loff_t pos) SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -506,7 +567,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) ret = vfs_read(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } return ret; } @@ -514,7 +575,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -522,7 +583,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, ret = vfs_write(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } return ret; @@ -589,6 +650,25 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) } EXPORT_SYMBOL(iov_shorten); +static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov, + unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn) +{ + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = len; + + iov_iter_init(&iter, rw, iov, nr_segs, len); + ret = fn(&kiocb, &iter); + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) { @@ -726,11 +806,7 @@ static ssize_t do_readv_writev(int type, struct file *file, ssize_t ret; io_fn_t fn; iov_fn_t fnv; - - if (!file->f_op) { - ret = -EINVAL; - goto out; - } + iter_fn_t iter_fn; ret = rw_copy_check_uvector(type, uvector, nr_segs, ARRAY_SIZE(iovstack), iovstack, &iov); @@ -746,13 +822,18 @@ static ssize_t do_readv_writev(int type, struct file *file, if (type == READ) { fn = file->f_op->read; fnv = file->f_op->aio_read; + iter_fn = file->f_op->read_iter; } else { fn = (io_fn_t)file->f_op->write; fnv = file->f_op->aio_write; + iter_fn = file->f_op->write_iter; file_start_write(file); } - if (fnv) + if (iter_fn) + ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, + pos, iter_fn); + else if (fnv) ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, pos, fnv); else @@ -778,7 +859,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, { if (!(file->f_mode & FMODE_READ)) return -EBADF; - if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) + if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; return do_readv_writev(READ, file, vec, vlen, pos); @@ -791,7 +872,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, { if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; return do_readv_writev(WRITE, file, vec, vlen, pos); @@ -802,7 +883,7 @@ EXPORT_SYMBOL(vfs_writev); SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -810,7 +891,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, ret = vfs_readv(f.file, vec, vlen, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } if (ret > 0) @@ -822,7 +903,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -830,7 +911,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, ret = vfs_writev(f.file, vec, vlen, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } if (ret > 0) @@ -905,14 +986,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, ssize_t ret; io_fn_t fn; iov_fn_t fnv; - - ret = -EINVAL; - if (!file->f_op) - goto out; - - ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) - goto out; + iter_fn_t iter_fn; ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, UIO_FASTIOV, iovstack, &iov); @@ -928,13 +1002,18 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, if (type == READ) { fn = file->f_op->read; fnv = file->f_op->aio_read; + iter_fn = file->f_op->read_iter; } else { fn = (io_fn_t)file->f_op->write; fnv = file->f_op->aio_write; + iter_fn = file->f_op->write_iter; file_start_write(file); } - if (fnv) + if (iter_fn) + ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, + pos, iter_fn); + else if (fnv) ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, pos, fnv); else @@ -965,7 +1044,7 @@ static size_t compat_readv(struct file *file, goto out; ret = -EINVAL; - if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) + if (!(file->f_mode & FMODE_CAN_READ)) goto out; ret = compat_do_readv_writev(READ, file, vec, vlen, pos); @@ -977,11 +1056,11 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen) + compat_ulong_t, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret; loff_t pos; @@ -991,13 +1070,13 @@ COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, ret = compat_readv(f.file, vec, vlen, &pos); if (ret >= 0) f.file->f_pos = pos; - fdput(f); + fdput_pos(f); return ret; } -COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, - const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos) +static long __compat_sys_preadv64(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos) { struct fd f; ssize_t ret; @@ -1014,12 +1093,22 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd, +#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 +COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + unsigned long, vlen, loff_t, pos) +{ + return __compat_sys_preadv64(fd, vec, vlen, pos); +} +#endif + +COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, + const struct compat_iovec __user *,vec, + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_preadv64(fd, vec, vlen, pos); + + return __compat_sys_preadv64(fd, vec, vlen, pos); } static size_t compat_writev(struct file *file, @@ -1032,7 +1121,7 @@ static size_t compat_writev(struct file *file, goto out; ret = -EINVAL; - if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) + if (!(file->f_mode & FMODE_CAN_WRITE)) goto out; ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); @@ -1044,11 +1133,11 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, const struct compat_iovec __user *, vec, - unsigned long, vlen) + compat_ulong_t, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret; loff_t pos; @@ -1058,13 +1147,13 @@ COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, ret = compat_writev(f.file, vec, vlen, &pos); if (ret >= 0) f.file->f_pos = pos; - fdput(f); + fdput_pos(f); return ret; } -COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, - const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos) +static long __compat_sys_pwritev64(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos) { struct fd f; ssize_t ret; @@ -1081,12 +1170,22 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd, +#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 +COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + return __compat_sys_pwritev64(fd, vec, vlen, pos); +} +#endif + +COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_pwritev64(fd, vec, vlen, pos); + + return __compat_sys_pwritev64(fd, vec, vlen, pos); } #endif diff --git a/fs/readdir.c b/fs/readdir.c index 93d71e57431..33fd92208cb 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -13,6 +13,7 @@ #include <linux/stat.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/fsnotify.h> #include <linux/dirent.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -24,7 +25,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); int res = -ENOTDIR; - if (!file->f_op || !file->f_op->iterate) + if (!file->f_op->iterate) goto out; res = security_file_permission(file, MAY_READ); @@ -40,6 +41,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx) ctx->pos = file->f_pos; res = file->f_op->iterate(file, ctx); file->f_pos = ctx->pos; + fsnotify_access(file); file_accessed(file); } mutex_unlock(&inode->i_mutex); diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index f096b80e73d..4a211f5b34b 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -48,18 +48,18 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); +int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int reiserfs_acl_chmod(struct inode *inode); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode); int reiserfs_cache_default_acl(struct inode *dir); -extern const struct xattr_handler reiserfs_posix_acl_default_handler; -extern const struct xattr_handler reiserfs_posix_acl_access_handler; #else #define reiserfs_cache_default_acl(inode) 0 #define reiserfs_get_acl NULL +#define reiserfs_set_acl NULL static inline int reiserfs_acl_chmod(struct inode *inode) { diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index dc9a6829f7c..dc198bc64c6 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -50,8 +50,10 @@ static inline void get_bit_address(struct super_block *s, unsigned int *bmap_nr, unsigned int *offset) { - /* It is in the bitmap block number equal to the block - * number divided by the number of bits in a block. */ + /* + * It is in the bitmap block number equal to the block + * number divided by the number of bits in a block. + */ *bmap_nr = block >> (s->s_blocksize_bits + 3); /* Within that bitmap block it is located at bit offset *offset. */ *offset = block & ((s->s_blocksize << 3) - 1); @@ -71,10 +73,12 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) get_bit_address(s, block, &bmap, &offset); - /* Old format filesystem? Unlikely, but the bitmaps are all up front so - * we need to account for it. */ + /* + * Old format filesystem? Unlikely, but the bitmaps are all + * up front so we need to account for it. + */ if (unlikely(test_bit(REISERFS_OLD_FORMAT, - &(REISERFS_SB(s)->s_properties)))) { + &REISERFS_SB(s)->s_properties))) { b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1; if (block >= bmap1 && block <= bmap1 + bmap_count) { @@ -108,8 +112,11 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) return 1; } -/* searches in journal structures for a given block number (bmap, off). If block - is found in reiserfs journal it suggests next free block candidate to test. */ +/* + * Searches in journal structures for a given block number (bmap, off). + * If block is found in reiserfs journal it suggests next free block + * candidate to test. + */ static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, int off, int *next) { @@ -120,7 +127,7 @@ static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, *next = tmp; PROC_INFO_INC(s, scan_bitmap.in_journal_hint); } else { - (*next) = off + 1; /* inc offset to avoid looping. */ + (*next) = off + 1; /* inc offset to avoid looping. */ PROC_INFO_INC(s, scan_bitmap.in_journal_nohint); } PROC_INFO_INC(s, scan_bitmap.retry); @@ -129,8 +136,10 @@ static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, return 0; } -/* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap - * block; */ +/* + * Searches for a window of zero bits with given minimum and maximum + * lengths in one bitmap block + */ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, unsigned int bmap_n, int *beg, int boundary, int min, int max, int unfm) @@ -142,14 +151,9 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, int org = *beg; BUG_ON(!th->t_trans_id); - RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of " "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1); PROC_INFO_INC(s, scan_bitmap.bmap); -/* this is unclear and lacks comments, explain how journal bitmaps - work here for the reader. Convey a sense of the design here. What - is a window? */ -/* - I mean `a window of zero bits' as in description of this function - Zam. */ if (!bi) { reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer " @@ -162,18 +166,21 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, return 0; while (1) { - cont: +cont: if (bi->free_count < min) { brelse(bh); - return 0; // No free blocks in this bitmap + return 0; /* No free blocks in this bitmap */ } /* search for a first zero bit -- beginning of a window */ *beg = reiserfs_find_next_zero_le_bit ((unsigned long *)(bh->b_data), boundary, *beg); - if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block - * cannot contain a zero window of minimum size */ + /* + * search for a zero bit fails or the rest of bitmap block + * cannot contain a zero window of minimum size + */ + if (*beg + min > boundary) { brelse(bh); return 0; } @@ -187,49 +194,75 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, next = end; break; } - /* finding the other end of zero bit window requires looking into journal structures (in - * case of searching for free blocks for unformatted nodes) */ + + /* + * finding the other end of zero bit window requires + * looking into journal structures (in case of + * searching for free blocks for unformatted nodes) + */ if (unfm && is_block_in_journal(s, bmap_n, end, &next)) break; } - /* now (*beg) points to beginning of zero bits window, - * (end) points to one bit after the window end */ - if (end - *beg >= min) { /* it seems we have found window of proper size */ + /* + * now (*beg) points to beginning of zero bits window, + * (end) points to one bit after the window end + */ + + /* found window of proper size */ + if (end - *beg >= min) { int i; reiserfs_prepare_for_journal(s, bh, 1); - /* try to set all blocks used checking are they still free */ + /* + * try to set all blocks used checking are + * they still free + */ for (i = *beg; i < end; i++) { - /* It seems that we should not check in journal again. */ + /* Don't check in journal again. */ if (reiserfs_test_and_set_le_bit (i, bh->b_data)) { - /* bit was set by another process - * while we slept in prepare_for_journal() */ + /* + * bit was set by another process while + * we slept in prepare_for_journal() + */ PROC_INFO_INC(s, scan_bitmap.stolen); - if (i >= *beg + min) { /* we can continue with smaller set of allocated blocks, - * if length of this set is more or equal to `min' */ + + /* + * we can continue with smaller set + * of allocated blocks, if length of + * this set is more or equal to `min' + */ + if (i >= *beg + min) { end = i; break; } - /* otherwise we clear all bit were set ... */ + + /* + * otherwise we clear all bit + * were set ... + */ while (--i >= *beg) reiserfs_clear_le_bit (i, bh->b_data); reiserfs_restore_prepared_buffer(s, bh); *beg = org; - /* ... and search again in current block from beginning */ + + /* + * Search again in current block + * from beginning + */ goto cont; } } bi->free_count -= (end - *beg); - journal_mark_dirty(th, s, bh); + journal_mark_dirty(th, bh); brelse(bh); /* free block count calculation */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); - journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); return end - (*beg); } else { @@ -268,11 +301,13 @@ static inline int block_group_used(struct super_block *s, u32 id) int bm = bmap_hash_id(s, id); struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm]; - /* If we don't have cached information on this bitmap block, we're + /* + * If we don't have cached information on this bitmap block, we're * going to have to load it later anyway. Loading it here allows us * to make a better decision. This favors long-term performance gain * with a better on-disk layout vs. a short term gain of skipping the - * read and potentially having a bad placement. */ + * read and potentially having a bad placement. + */ if (info->free_count == UINT_MAX) { struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm); brelse(bh); @@ -305,26 +340,26 @@ __le32 reiserfs_choose_packing(struct inode * dir) return packing; } -/* Tries to find contiguous zero bit window (given size) in given region of - * bitmap and place new blocks there. Returns number of allocated blocks. */ +/* + * Tries to find contiguous zero bit window (given size) in given region of + * bitmap and place new blocks there. Returns number of allocated blocks. + */ static int scan_bitmap(struct reiserfs_transaction_handle *th, b_blocknr_t * start, b_blocknr_t finish, int min, int max, int unfm, sector_t file_block) { int nr_allocated = 0; struct super_block *s = th->t_super; - /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr - * - Hans, it is not a block number - Zam. */ - unsigned int bm, off; unsigned int end_bm, end_off; unsigned int off_max = s->s_blocksize << 3; BUG_ON(!th->t_trans_id); - PROC_INFO_INC(s, scan_bitmap.call); + + /* No point in looking for more free blocks */ if (SB_FREE_BLOCKS(s) <= 0) - return 0; // No point in looking for more free blocks + return 0; get_bit_address(s, *start, &bm, &off); get_bit_address(s, finish, &end_bm, &end_off); @@ -333,7 +368,8 @@ static int scan_bitmap(struct reiserfs_transaction_handle *th, if (end_bm > reiserfs_bmap_count(s)) end_bm = reiserfs_bmap_count(s); - /* When the bitmap is more than 10% free, anyone can allocate. + /* + * When the bitmap is more than 10% free, anyone can allocate. * When it's less than 10% free, only files that already use the * bitmap are allowed. Once we pass 80% full, this restriction * is lifted. @@ -371,7 +407,7 @@ static int scan_bitmap(struct reiserfs_transaction_handle *th, nr_allocated = scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); - ret: +ret: *start = bm * off_max + off; return nr_allocated; @@ -388,9 +424,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, unsigned int nr, offset; BUG_ON(!th->t_trans_id); - PROC_INFO_INC(s, free_block); - rs = SB_DISK_SUPER_BLOCK(s); sbh = SB_BUFFER_WITH_SB(s); apbi = SB_AP_BITMAP(s); @@ -415,14 +449,14 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, "block %lu: bit already cleared", block); } apbi[nr].free_count++; - journal_mark_dirty(th, s, bmbh); + journal_mark_dirty(th, bmbh); brelse(bmbh); reiserfs_prepare_for_journal(s, sbh, 1); /* update super block */ set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); - journal_mark_dirty(th, s, sbh); + journal_mark_dirty(th, sbh); if (for_unformatted) { int depth = reiserfs_write_unlock_nested(s); dquot_free_block_nodirty(inode, 1); @@ -435,8 +469,8 @@ void reiserfs_free_block(struct reiserfs_transaction_handle *th, int for_unformatted) { struct super_block *s = th->t_super; - BUG_ON(!th->t_trans_id); + BUG_ON(!th->t_trans_id); RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); if (!is_reusable(s, block, 1)) return; @@ -471,6 +505,7 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th, unsigned long save = ei->i_prealloc_block; int dirty = 0; struct inode *inode = &ei->vfs_inode; + BUG_ON(!th->t_trans_id); #ifdef CONFIG_REISERFS_CHECK if (ei->i_prealloc_count < 0) @@ -486,7 +521,7 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th, if (dirty) reiserfs_update_sd(th, inode); ei->i_prealloc_block = save; - list_del_init(&(ei->i_prealloc_list)); + list_del_init(&ei->i_prealloc_list); } /* FIXME: It should be inline function */ @@ -494,6 +529,7 @@ void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, struct inode *inode) { struct reiserfs_inode_info *ei = REISERFS_I(inode); + BUG_ON(!th->t_trans_id); if (ei->i_prealloc_count) __discard_prealloc(th, ei); @@ -504,7 +540,6 @@ void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; BUG_ON(!th->t_trans_id); - while (!list_empty(plist)) { struct reiserfs_inode_info *ei; ei = list_entry(plist->next, struct reiserfs_inode_info, @@ -532,7 +567,8 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options) { char *this_char, *value; - REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */ + /* clear default settings */ + REISERFS_SB(s)->s_alloc_options.bits = 0; while ((this_char = strsep(&options, ":")) != NULL) { if ((value = strchr(this_char, '=')) != NULL) @@ -562,7 +598,7 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options) if (!strcmp(this_char, "displacing_new_packing_localities")) { SET_OPTION(displacing_new_packing_localities); continue; - }; + } if (!strcmp(this_char, "old_hashed_relocation")) { SET_OPTION(old_hashed_relocation); @@ -729,11 +765,12 @@ void show_alloc_options(struct seq_file *seq, struct super_block *s) static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) { char *hash_in; + if (hint->formatted_node) { hash_in = (char *)&hint->key.k_dir_id; } else { if (!hint->inode) { - //hint->search_start = hint->beg; + /*hint->search_start = hint->beg;*/ hash_in = (char *)&hint->key.k_dir_id; } else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) @@ -757,6 +794,7 @@ static void dirid_groups(reiserfs_blocknr_hint_t * hint) __u32 dirid = 0; int bm = 0; struct super_block *sb = hint->th->t_super; + if (hint->inode) dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); else if (hint->formatted_node) @@ -786,7 +824,8 @@ static void oid_groups(reiserfs_blocknr_hint_t * hint) dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); - /* keep the root dir and it's first set of subdirs close to + /* + * keep the root dir and it's first set of subdirs close to * the start of the disk */ if (dirid <= 2) @@ -800,7 +839,8 @@ static void oid_groups(reiserfs_blocknr_hint_t * hint) } } -/* returns 1 if it finds an indirect item and gets valid hint info +/* + * returns 1 if it finds an indirect item and gets valid hint info * from it, otherwise 0 */ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) @@ -812,25 +852,29 @@ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) __le32 *item; int ret = 0; - if (!hint->path) /* reiserfs code can call this function w/o pointer to path - * structure supplied; then we rely on supplied search_start */ + /* + * reiserfs code can call this function w/o pointer to path + * structure supplied; then we rely on supplied search_start + */ + if (!hint->path) return 0; path = hint->path; bh = get_last_bh(path); RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor"); - ih = get_ih(path); + ih = tp_item_head(path); pos_in_item = path->pos_in_item; - item = get_item(path); + item = tp_item_body(path); hint->search_start = bh->b_blocknr; + /* + * for indirect item: go to left and look for the first non-hole entry + * in the indirect item + */ if (!hint->formatted_node && is_indirect_le_ih(ih)) { - /* for indirect item: go to left and look for the first non-hole entry - in the indirect item */ if (pos_in_item == I_UNFM_NUM(ih)) pos_in_item--; -// pos_in_item = I_UNFM_NUM (ih) - 1; while (pos_in_item >= 0) { int t = get_block_num(item, pos_in_item); if (t) { @@ -846,10 +890,12 @@ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) return ret; } -/* should be, if formatted node, then try to put on first part of the device - specified as number of percent with mount option device, else try to put - on last of device. This is not to say it is good code to do so, - but the effect should be measured. */ +/* + * should be, if formatted node, then try to put on first part of the device + * specified as number of percent with mount option device, else try to put + * on last of device. This is not to say it is good code to do so, + * but the effect should be measured. + */ static inline void set_border_in_hint(struct super_block *s, reiserfs_blocknr_hint_t * hint) { @@ -975,21 +1021,27 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint, set_border_in_hint(s, hint); #ifdef DISPLACE_NEW_PACKING_LOCALITIES - /* whenever we create a new directory, we displace it. At first we will - hash for location, later we might look for a moderately empty place for - it */ + /* + * whenever we create a new directory, we displace it. At first + * we will hash for location, later we might look for a moderately + * empty place for it + */ if (displacing_new_packing_localities(s) && hint->th->displace_new_blocks) { displace_new_packing_locality(hint); - /* we do not continue determine_search_start, - * if new packing locality is being displaced */ + /* + * we do not continue determine_search_start, + * if new packing locality is being displaced + */ return; } #endif - /* all persons should feel encouraged to add more special cases here and - * test them */ + /* + * all persons should feel encouraged to add more special cases + * here and test them + */ if (displacing_large_files(s) && !hint->formatted_node && this_blocknr_allocation_would_make_it_a_large_file(hint)) { @@ -997,8 +1049,10 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint, return; } - /* if none of our special cases is relevant, use the left neighbor in the - tree order of the new node we are allocating for */ + /* + * if none of our special cases is relevant, use the left + * neighbor in the tree order of the new node we are allocating for + */ if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) { hash_formatted_node(hint); return; @@ -1006,10 +1060,13 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint, unfm_hint = get_left_neighbor(hint); - /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation, - new blocks are displaced based on directory ID. Also, if suggested search_start - is less than last preallocated block, we start searching from it, assuming that - HDD dataflow is faster in forward direction */ + /* + * Mimic old block allocator behaviour, that is if VFS allowed for + * preallocation, new blocks are displaced based on directory ID. + * Also, if suggested search_start is less than last preallocated + * block, we start searching from it, assuming that HDD dataflow + * is faster in forward direction + */ if (TEST_OPTION(old_way, s)) { if (!hint->formatted_node) { if (!reiserfs_hashed_relocation(s)) @@ -1038,11 +1095,13 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint, TEST_OPTION(old_hashed_relocation, s)) { old_hashed_relocation(hint); } + /* new_hashed_relocation works with both formatted/unformatted nodes */ if ((!unfm_hint || hint->formatted_node) && TEST_OPTION(new_hashed_relocation, s)) { new_hashed_relocation(hint); } + /* dirid grouping works only on unformatted nodes */ if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) { dirid_groups(hint); @@ -1080,8 +1139,6 @@ static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) return CARRY_ON; } -/* XXX I know it could be merged with upper-level function; - but may be result function would be too complex. */ static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, b_blocknr_t start, @@ -1109,7 +1166,10 @@ static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint, /* do we have something to fill prealloc. array also ? */ if (nr_allocated > 0) { - /* it means prealloc_size was greater that 0 and we do preallocation */ + /* + * it means prealloc_size was greater that 0 and + * we do preallocation + */ list_add(&REISERFS_I(hint->inode)->i_prealloc_list, &SB_JOURNAL(hint->th->t_super)-> j_prealloc_list); @@ -1177,7 +1237,8 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start start = 0; finish = hint->beg; break; - default: /* We've tried searching everywhere, not enough space */ + default: + /* We've tried searching everywhere, not enough space */ /* Free the blocks */ if (!hint->formatted_node) { #ifdef REISERQUOTA_DEBUG @@ -1262,8 +1323,11 @@ static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint, return amount_needed; } -int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, int amount_needed, int reserved_by_us /* Amount of blocks we have - already reserved */ ) +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint, + b_blocknr_t *new_blocknrs, + int amount_needed, + /* Amount of blocks we have already reserved */ + int reserved_by_us) { int initial_amount_needed = amount_needed; int ret; @@ -1275,15 +1339,21 @@ int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new return NO_DISK_SPACE; /* should this be if !hint->inode && hint->preallocate? */ /* do you mean hint->formatted_node can be removed ? - Zam */ - /* hint->formatted_node cannot be removed because we try to access - inode information here, and there is often no inode assotiated with - metadata allocations - green */ + /* + * hint->formatted_node cannot be removed because we try to access + * inode information here, and there is often no inode associated with + * metadata allocations - green + */ if (!hint->formatted_node && hint->preallocate) { amount_needed = use_preallocated_list_if_available (hint, new_blocknrs, amount_needed); - if (amount_needed == 0) /* all blocknrs we need we got from - prealloc. list */ + + /* + * We have all the block numbers we need from the + * prealloc list + */ + if (amount_needed == 0) return CARRY_ON; new_blocknrs += (initial_amount_needed - amount_needed); } @@ -1297,10 +1367,12 @@ int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new ret = blocknrs_and_prealloc_arrays_from_search_start (hint, new_blocknrs, amount_needed); - /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we - * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second - * variant) */ - + /* + * We used prealloc. list to fill (partially) new_blocknrs array. + * If final allocation fails we need to return blocks back to + * prealloc. list or just free them. -- Zam (I chose second + * variant) + */ if (ret != CARRY_ON) { while (amount_needed++ < initial_amount_needed) { reiserfs_free_block(hint->th, hint->inode, @@ -1339,10 +1411,12 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap; struct buffer_head *bh; - /* Way old format filesystems had the bitmaps packed up front. - * I doubt there are any of these left, but just in case... */ + /* + * Way old format filesystems had the bitmaps packed up front. + * I doubt there are any of these left, but just in case... + */ if (unlikely(test_bit(REISERFS_OLD_FORMAT, - &(REISERFS_SB(sb)->s_properties)))) + &REISERFS_SB(sb)->s_properties))) block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap; else if (bitmap == 0) block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 1fd2051109a..d9f5a60dd59 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -59,7 +59,10 @@ static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *d int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) { - struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ + + /* key of current position in the directory (key of directory entry) */ + struct cpu_key pos_key; + INITIALIZE_PATH(path_to_entry); struct buffer_head *bh; int item_num, entry_num; @@ -77,21 +80,28 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) reiserfs_check_lock_depth(inode->i_sb, "readdir"); - /* form key for search the next directory entry using f_pos field of - file structure */ + /* + * form key for search the next directory entry using + * f_pos field of file structure + */ make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); next_pos = cpu_key_k_offset(&pos_key); path_to_entry.reada = PATH_READA; while (1) { - research: - /* search the directory item, containing entry with specified key */ +research: + /* + * search the directory item, containing entry with + * specified key + */ search_res = search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, &de); if (search_res == IO_ERROR) { - // FIXME: we could just skip part of directory which could - // not be read + /* + * FIXME: we could just skip part of directory + * which could not be read + */ ret = -EIO; goto out; } @@ -102,40 +112,49 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) store_ih(&tmp_ih, ih); /* we must have found item, that is item of this directory, */ - RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key), + RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key), "vs-9000: found item %h does not match to dir we readdir %K", ih, &pos_key); RFALSE(item_num > B_NR_ITEMS(bh) - 1, "vs-9005 item_num == %d, item amount == %d", item_num, B_NR_ITEMS(bh)); - /* and entry must be not more than number of entries in the item */ - RFALSE(I_ENTRY_COUNT(ih) < entry_num, + /* + * and entry must be not more than number of entries + * in the item + */ + RFALSE(ih_entry_count(ih) < entry_num, "vs-9010: entry number is too big %d (%d)", - entry_num, I_ENTRY_COUNT(ih)); + entry_num, ih_entry_count(ih)); + /* + * go through all entries in the directory item beginning + * from the entry, that has been found + */ if (search_res == POSITION_FOUND - || entry_num < I_ENTRY_COUNT(ih)) { - /* go through all entries in the directory item beginning from the entry, that has been found */ + || entry_num < ih_entry_count(ih)) { struct reiserfs_de_head *deh = B_I_DEH(bh, ih) + entry_num; - for (; entry_num < I_ENTRY_COUNT(ih); + for (; entry_num < ih_entry_count(ih); entry_num++, deh++) { int d_reclen; char *d_name; ino_t d_ino; + loff_t cur_pos = deh_offset(deh); + /* it is hidden entry */ if (!de_visible(deh)) - /* it is hidden entry */ continue; d_reclen = entry_length(bh, ih, entry_num); d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh); if (d_reclen <= 0 || d_name + d_reclen > bh->b_data + bh->b_size) { - /* There is corrupted data in entry, - * We'd better stop here */ + /* + * There is corrupted data in entry, + * We'd better stop here + */ pathrelse(&path_to_entry); ret = -EIO; goto out; @@ -144,10 +163,10 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) if (!d_name[d_reclen - 1]) d_reclen = strlen(d_name); + /* too big to send back to VFS */ if (d_reclen > REISERFS_MAX_NAME(inode->i_sb-> s_blocksize)) { - /* too big to send back to VFS */ continue; } @@ -172,10 +191,14 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) goto research; } } - // Note, that we copy name to user space via temporary - // buffer (local_buf) because filldir will block if - // user space buffer is swapped out. At that time - // entry can move to somewhere else + + /* + * Note, that we copy name to user space via + * temporary buffer (local_buf) because + * filldir will block if user space buffer is + * swapped out. At that time entry can move to + * somewhere else + */ memcpy(local_buf, d_name, d_reclen); /* @@ -196,8 +219,9 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) if (local_buf != small_buf) { kfree(local_buf); } - // next entry should be looked for with such offset - next_pos = deh_offset(deh) + 1; + + /* deh_offset(deh) may be invalid now. */ + next_pos = cur_pos + 1; if (item_moved(&tmp_ih, &path_to_entry)) { set_cpu_key_k_offset(&pos_key, @@ -207,22 +231,26 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) } /* for */ } + /* end of directory has been reached */ if (item_num != B_NR_ITEMS(bh) - 1) - // end of directory has been reached goto end; - /* item we went through is last item of node. Using right - delimiting key check is it directory end */ + /* + * item we went through is last item of node. Using right + * delimiting key check is it directory end + */ rkey = get_rkey(&path_to_entry, inode->i_sb); if (!comp_le_keys(rkey, &MIN_KEY)) { - /* set pos_key to key, that is the smallest and greater - that key of the last entry in the item */ + /* + * set pos_key to key, that is the smallest and greater + * that key of the last entry in the item + */ set_cpu_key_k_offset(&pos_key, next_pos); continue; } + /* end of directory has been reached */ if (COMP_SHORT_KEYS(rkey, &pos_key)) { - // end of directory has been reached goto end; } @@ -246,71 +274,73 @@ static int reiserfs_readdir(struct file *file, struct dir_context *ctx) return reiserfs_readdir_inode(file_inode(file), ctx); } -/* compose directory item containing "." and ".." entries (entries are - not aligned to 4 byte boundary) */ -/* the last four params are LE */ +/* + * compose directory item containing "." and ".." entries (entries are + * not aligned to 4 byte boundary) + */ void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, __le32 par_dirid, __le32 par_objid) { - struct reiserfs_de_head *deh; + struct reiserfs_de_head *dot, *dotdot; memset(body, 0, EMPTY_DIR_SIZE_V1); - deh = (struct reiserfs_de_head *)body; + dot = (struct reiserfs_de_head *)body; + dotdot = dot + 1; /* direntry header of "." */ - put_deh_offset(&(deh[0]), DOT_OFFSET); + put_deh_offset(dot, DOT_OFFSET); /* these two are from make_le_item_head, and are are LE */ - deh[0].deh_dir_id = dirid; - deh[0].deh_objectid = objid; - deh[0].deh_state = 0; /* Endian safe if 0 */ - put_deh_location(&(deh[0]), EMPTY_DIR_SIZE_V1 - strlen(".")); - mark_de_visible(&(deh[0])); + dot->deh_dir_id = dirid; + dot->deh_objectid = objid; + dot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen(".")); + mark_de_visible(dot); /* direntry header of ".." */ - put_deh_offset(&(deh[1]), DOT_DOT_OFFSET); + put_deh_offset(dotdot, DOT_DOT_OFFSET); /* key of ".." for the root directory */ /* these two are from the inode, and are are LE */ - deh[1].deh_dir_id = par_dirid; - deh[1].deh_objectid = par_objid; - deh[1].deh_state = 0; /* Endian safe if 0 */ - put_deh_location(&(deh[1]), deh_location(&(deh[0])) - strlen("..")); - mark_de_visible(&(deh[1])); + dotdot->deh_dir_id = par_dirid; + dotdot->deh_objectid = par_objid; + dotdot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dotdot, deh_location(dot) - strlen("..")); + mark_de_visible(dotdot); /* copy ".." and "." */ - memcpy(body + deh_location(&(deh[0])), ".", 1); - memcpy(body + deh_location(&(deh[1])), "..", 2); + memcpy(body + deh_location(dot), ".", 1); + memcpy(body + deh_location(dotdot), "..", 2); } /* compose directory item containing "." and ".." entries */ void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, __le32 par_dirid, __le32 par_objid) { - struct reiserfs_de_head *deh; + struct reiserfs_de_head *dot, *dotdot; memset(body, 0, EMPTY_DIR_SIZE); - deh = (struct reiserfs_de_head *)body; + dot = (struct reiserfs_de_head *)body; + dotdot = dot + 1; /* direntry header of "." */ - put_deh_offset(&(deh[0]), DOT_OFFSET); + put_deh_offset(dot, DOT_OFFSET); /* these two are from make_le_item_head, and are are LE */ - deh[0].deh_dir_id = dirid; - deh[0].deh_objectid = objid; - deh[0].deh_state = 0; /* Endian safe if 0 */ - put_deh_location(&(deh[0]), EMPTY_DIR_SIZE - ROUND_UP(strlen("."))); - mark_de_visible(&(deh[0])); + dot->deh_dir_id = dirid; + dot->deh_objectid = objid; + dot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen("."))); + mark_de_visible(dot); /* direntry header of ".." */ - put_deh_offset(&(deh[1]), DOT_DOT_OFFSET); + put_deh_offset(dotdot, DOT_DOT_OFFSET); /* key of ".." for the root directory */ /* these two are from the inode, and are are LE */ - deh[1].deh_dir_id = par_dirid; - deh[1].deh_objectid = par_objid; - deh[1].deh_state = 0; /* Endian safe if 0 */ - put_deh_location(&(deh[1]), - deh_location(&(deh[0])) - ROUND_UP(strlen(".."))); - mark_de_visible(&(deh[1])); + dotdot->deh_dir_id = par_dirid; + dotdot->deh_objectid = par_objid; + dotdot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen(".."))); + mark_de_visible(dotdot); /* copy ".." and "." */ - memcpy(body + deh_location(&(deh[0])), ".", 1); - memcpy(body + deh_location(&(deh[1])), "..", 2); + memcpy(body + deh_location(dot), ".", 1); + memcpy(body + deh_location(dotdot), "..", 2); } diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 2b7882b508d..54fdf196bfb 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -2,18 +2,13 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ -/* Now we have all buffers that must be used in balancing of the tree */ -/* Further calculations can not cause schedule(), and thus the buffer */ -/* tree will be stable until the balancing will be finished */ -/* balance the tree according to the analysis made before, */ -/* and using buffers obtained after all above. */ - -/** - ** balance_leaf_when_delete - ** balance_leaf - ** do_balance - ** - **/ +/* + * Now we have all buffers that must be used in balancing of the tree + * Further calculations can not cause schedule(), and thus the buffer + * tree will be stable until the balancing will be finished + * balance the tree according to the analysis made before, + * and using buffers obtained after all above. + */ #include <asm/uaccess.h> #include <linux/time.h> @@ -61,48 +56,190 @@ static inline void buffer_info_init_bh(struct tree_balance *tb, inline void do_balance_mark_leaf_dirty(struct tree_balance *tb, struct buffer_head *bh, int flag) { - journal_mark_dirty(tb->transaction_handle, - tb->transaction_handle->t_super, bh); + journal_mark_dirty(tb->transaction_handle, bh); } #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty #define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty -/* summary: - if deleting something ( tb->insert_size[0] < 0 ) - return(balance_leaf_when_delete()); (flag d handled here) - else - if lnum is larger than 0 we put items into the left node - if rnum is larger than 0 we put items into the right node - if snum1 is larger than 0 we put items into the new node s1 - if snum2 is larger than 0 we put items into the new node s2 -Note that all *num* count new items being created. - -It would be easier to read balance_leaf() if each of these summary -lines was a separate procedure rather than being inlined. I think -that there are many passages here and in balance_leaf_when_delete() in -which two calls to one procedure can replace two passages, and it -might save cache space and improve software maintenance costs to do so. - -Vladimir made the perceptive comment that we should offload most of -the decision making in this function into fix_nodes/check_balance, and -then create some sort of structure in tb that says what actions should -be performed by do_balance. - --Hans */ - -/* Balance leaf node in case of delete or cut: insert_size[0] < 0 +/* + * summary: + * if deleting something ( tb->insert_size[0] < 0 ) + * return(balance_leaf_when_delete()); (flag d handled here) + * else + * if lnum is larger than 0 we put items into the left node + * if rnum is larger than 0 we put items into the right node + * if snum1 is larger than 0 we put items into the new node s1 + * if snum2 is larger than 0 we put items into the new node s2 + * Note that all *num* count new items being created. + */ + +static void balance_leaf_when_delete_del(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + struct buffer_info bi; +#ifdef CONFIG_REISERFS_CHECK + struct item_head *ih = item_head(tbS0, item_pos); +#endif + + RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], + "vs-12013: mode Delete, insert size %d, ih to be deleted %h", + -tb->insert_size[0], ih); + + buffer_info_init_tbS0(tb, &bi); + leaf_delete_items(&bi, 0, item_pos, 1, -1); + + if (!item_pos && tb->CFL[0]) { + if (B_NR_ITEMS(tbS0)) { + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); + } else { + if (!PATH_H_POSITION(tb->tb_path, 1)) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, 0), 0); + } + } + + RFALSE(!item_pos && !tb->CFL[0], + "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], + tb->L[0]); +} + +/* cut item in S[0] */ +static void balance_leaf_when_delete_cut(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + struct item_head *ih = item_head(tbS0, item_pos); + int pos_in_item = tb->tb_path->pos_in_item; + struct buffer_info bi; + buffer_info_init_tbS0(tb, &bi); + + if (is_direntry_le_ih(ih)) { + /* + * UFS unlink semantics are such that you can only + * delete one directory entry at a time. + * + * when we cut a directory tb->insert_size[0] means + * number of entries to be cut (always 1) + */ + tb->insert_size[0] = -1; + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!item_pos && !pos_in_item && !tb->CFL[0], + "PAP-12030: can not change delimiting key. CFL[0]=%p", + tb->CFL[0]); + + if (!item_pos && !pos_in_item && tb->CFL[0]) + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); + } else { + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!ih_item_len(ih), + "PAP-12035: cut must leave non-zero dynamic " + "length of item"); + } +} + +static int balance_leaf_when_delete_left(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + + /* L[0] must be joined with S[0] */ + if (tb->lnum[0] == -1) { + /* R[0] must be also joined with S[0] */ + if (tb->rnum[0] == -1) { + if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) { + /* + * all contents of all the + * 3 buffers will be in L[0] + */ + if (PATH_H_POSITION(tb->tb_path, 1) == 0 && + 1 < B_NR_ITEMS(tb->FR[0])) + replace_key(tb, tb->CFL[0], + tb->lkey[0], tb->FR[0], 1); + + leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1, + NULL); + leaf_move_items(LEAF_FROM_R_TO_L, tb, + B_NR_ITEMS(tb->R[0]), -1, + NULL); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, tb->R[0]); + + return 0; + } + + /* all contents of all the 3 buffers will be in R[0] */ + leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL); + leaf_move_items(LEAF_FROM_L_TO_R, tb, + B_NR_ITEMS(tb->L[0]), -1, NULL); + + /* right_delimiting_key is correct in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, tb->L[0]); + + return -1; + } + + RFALSE(tb->rnum[0] != 0, + "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); + /* all contents of L[0] and S[0] will be in L[0] */ + leaf_shift_left(tb, n, -1); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; + } + + /* + * a part of contents of S[0] will be in L[0] and + * the rest part of S[0] will be in R[0] + */ + + RFALSE((tb->lnum[0] + tb->rnum[0] < n) || + (tb->lnum[0] + tb->rnum[0] > n + 1), + "PAP-12050: rnum(%d) and lnum(%d) and item " + "number(%d) in S[0] are not consistent", + tb->rnum[0], tb->lnum[0], n); + RFALSE((tb->lnum[0] + tb->rnum[0] == n) && + (tb->lbytes != -1 || tb->rbytes != -1), + "PAP-12055: bad rbytes (%d)/lbytes (%d) " + "parameters when items are not split", + tb->rbytes, tb->lbytes); + RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) && + (tb->lbytes < 1 || tb->rbytes != -1), + "PAP-12060: bad rbytes (%d)/lbytes (%d) " + "parameters when items are split", + tb->rbytes, tb->lbytes); + + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; +} + +/* + * Balance leaf node in case of delete or cut: insert_size[0] < 0 * * lnum, rnum can have values >= -1 * -1 means that the neighbor must be joined with S * 0 means that nothing should be done with the neighbor - * >0 means to shift entirely or partly the specified number of items to the neighbor + * >0 means to shift entirely or partly the specified number of items + * to the neighbor */ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int item_pos = PATH_LAST_POSITION(tb->tb_path); - int pos_in_item = tb->tb_path->pos_in_item; struct buffer_info bi; int n; struct item_head *ih; @@ -114,1527 +251,1202 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), "PAP-12010: tree can not be empty"); - ih = B_N_PITEM_HEAD(tbS0, item_pos); + ih = item_head(tbS0, item_pos); buffer_info_init_tbS0(tb, &bi); /* Delete or truncate the item */ - switch (flag) { - case M_DELETE: /* delete item in S[0] */ + BUG_ON(flag != M_DELETE && flag != M_CUT); + if (flag == M_DELETE) + balance_leaf_when_delete_del(tb); + else /* M_CUT */ + balance_leaf_when_delete_cut(tb); - RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], - "vs-12013: mode Delete, insert size %d, ih to be deleted %h", - -tb->insert_size[0], ih); - leaf_delete_items(&bi, 0, item_pos, 1, -1); + /* + * the rule is that no shifting occurs unless by shifting + * a node can be freed + */ + n = B_NR_ITEMS(tbS0); - if (!item_pos && tb->CFL[0]) { - if (B_NR_ITEMS(tbS0)) { - replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, - 0); - } else { - if (!PATH_H_POSITION(tb->tb_path, 1)) - replace_key(tb, tb->CFL[0], tb->lkey[0], - PATH_H_PPARENT(tb->tb_path, - 0), 0); - } - } - RFALSE(!item_pos && !tb->CFL[0], - "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], - tb->L[0]); + /* L[0] takes part in balancing */ + if (tb->lnum[0]) + return balance_leaf_when_delete_left(tb); + + if (tb->rnum[0] == -1) { + /* all contents of R[0] and S[0] will be in R[0] */ + leaf_shift_right(tb, n, -1); + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } + + RFALSE(tb->rnum[0], + "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); + return 0; +} - break; +static void balance_leaf_insert_left(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + int ret; + struct buffer_info bi; + int n = B_NR_ITEMS(tb->L[0]); + + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { + /* part of new item falls into L[0] */ + int new_item_len, shift; + int version; + + ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1); + + /* Calculate item length to insert to S[0] */ + new_item_len = ih_item_len(ih) - tb->lbytes; + + /* Calculate and check item length to insert to L[0] */ + put_ih_item_len(ih, ih_item_len(ih) - new_item_len); + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12080: there is nothing to insert into L[0]: " + "ih_item_len=%d", ih_item_len(ih)); + + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, + min_t(int, tb->zeroes_num, ih_item_len(ih))); + + version = ih_version(ih); + + /* + * Calculate key component, item length and body to + * insert into S[0] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + + add_le_ih_k_offset(ih, tb->lbytes << shift); + + put_ih_item_len(ih, new_item_len); + if (tb->lbytes > tb->zeroes_num) { + body += (tb->lbytes - tb->zeroes_num); + tb->zeroes_num = 0; + } else + tb->zeroes_num -= tb->lbytes; + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12085: there is nothing to insert into S[0]: " + "ih_item_len=%d", ih_item_len(ih)); + } else { + /* new item in whole falls into L[0] */ + /* Shift lnum[0]-1 items to L[0] */ + ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes); + + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, + tb->zeroes_num); + tb->insert_size[0] = 0; + tb->zeroes_num = 0; + } +} - case M_CUT:{ /* cut item in S[0] */ - if (is_direntry_le_ih(ih)) { +static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; - /* UFS unlink semantics are such that you can only delete one directory entry at a time. */ - /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */ - tb->insert_size[0] = -1; - leaf_cut_from_buffer(&bi, item_pos, pos_in_item, - -tb->insert_size[0]); + RFALSE(tb->zeroes_num, + "PAP-12090: invalid parameter in case of a directory"); + + /* directory item */ + if (tb->lbytes > tb->pos_in_item) { + /* new directory entry falls into L[0] */ + struct item_head *pasted; + int ret, l_pos_in_item = tb->pos_in_item; + + /* + * Shift lnum[0] - 1 items in whole. + * Shift lbytes - 1 entries from given directory item + */ + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1); + if (ret && !tb->item_pos) { + pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1); + l_pos_in_item += ih_entry_count(pasted) - + (tb->lbytes - 1); + } - RFALSE(!item_pos && !pos_in_item && !tb->CFL[0], - "PAP-12030: can not change delimiting key. CFL[0]=%p", - tb->CFL[0]); + /* Append given directory entry to directory item */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, + l_pos_in_item, tb->insert_size[0], + body, tb->zeroes_num); + + /* + * previous string prepared space for pasting new entry, + * following string pastes this entry + */ + + /* + * when we have merge directory item, pos_in_item + * has been changed too + */ + + /* paste new directory entry. 1 is entry number */ + leaf_paste_entries(&bi, n + tb->item_pos - ret, + l_pos_in_item, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + tb->insert_size[0] = 0; + } else { + /* new directory item doesn't fall into L[0] */ + /* + * Shift lnum[0]-1 items in whole. Shift lbytes + * directory entries from directory item number lnum[0] + */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + } - if (!item_pos && !pos_in_item && tb->CFL[0]) { - replace_key(tb, tb->CFL[0], tb->lkey[0], - tbS0, 0); - } - } else { - leaf_cut_from_buffer(&bi, item_pos, pos_in_item, - -tb->insert_size[0]); + /* Calculate new position to append in item body */ + tb->pos_in_item -= tb->lbytes; +} - RFALSE(!ih_item_len(ih), - "PAP-12035: cut must leave non-zero dynamic length of item"); - } - break; +static void balance_leaf_paste_left_shift(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; + + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { + balance_leaf_paste_left_shift_dirent(tb, ih, body); + return; + } + + RFALSE(tb->lbytes <= 0, + "PAP-12095: there is nothing to shift to L[0]. " + "lbytes=%d", tb->lbytes); + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)), + "PAP-12100: incorrect position to paste: " + "item_len=%d, pos_in_item=%d", + ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item); + + /* appended item will be in L[0] in whole */ + if (tb->lbytes >= tb->pos_in_item) { + struct item_head *tbS0_pos_ih, *tbL0_ih; + struct item_head *tbS0_0_ih; + struct reiserfs_key *left_delim_key; + int ret, l_n, version, temp_l; + + tbS0_pos_ih = item_head(tbS0, tb->item_pos); + tbS0_0_ih = item_head(tbS0, 0); + + /* + * this bytes number must be appended + * to the last item of L[h] + */ + l_n = tb->lbytes - tb->pos_in_item; + + /* Calculate new insert_size[0] */ + tb->insert_size[0] -= l_n; + + RFALSE(tb->insert_size[0] <= 0, + "PAP-12105: there is nothing to paste into " + "L[0]. insert_size=%d", tb->insert_size[0]); + + ret = leaf_shift_left(tb, tb->lnum[0], + ih_item_len(tbS0_pos_ih)); + + tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret); + + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, + ih_item_len(tbL0_ih), l_n, body, + min_t(int, l_n, tb->zeroes_num)); + + /* + * 0-th item in S0 can be only of DIRECT type + * when l_n != 0 + */ + temp_l = l_n; + + RFALSE(ih_item_len(tbS0_0_ih), + "PAP-12106: item length must be 0"); + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key, + leaf_key(tb->L[0], n + tb->item_pos - ret)), + "PAP-12107: items must be of the same file"); + + if (is_indirect_le_ih(tbL0_ih)) { + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + temp_l = l_n << shift; } + /* update key of first item in S0 */ + version = ih_version(tbS0_0_ih); + add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l); + + /* update left delimiting key */ + left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]); + add_le_key_k_offset(version, left_delim_key, temp_l); + + /* + * Calculate new body, position in item and + * insert_size[0] + */ + if (l_n > tb->zeroes_num) { + body += (l_n - tb->zeroes_num); + tb->zeroes_num = 0; + } else + tb->zeroes_num -= l_n; + tb->pos_in_item = 0; + + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key, + leaf_key(tb->L[0], + B_NR_ITEMS(tb->L[0]) - 1)) || + !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) || + !op_is_left_mergeable(left_delim_key, tbS0->b_size), + "PAP-12120: item must be merge-able with left " + "neighboring item"); + } else { + /* only part of the appended item will be in L[0] */ + + /* Calculate position in item for append in S[0] */ + tb->pos_in_item -= tb->lbytes; + + RFALSE(tb->pos_in_item <= 0, + "PAP-12125: no place for paste. pos_in_item=%d", + tb->pos_in_item); + + /* + * Shift lnum[0] - 1 items in whole. + * Shift lbytes - 1 byte from item number lnum[0] + */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + } +} + - default: - print_cur_tb("12040"); - reiserfs_panic(tb->tb_sb, "PAP-12040", - "unexpected mode: %s(%d)", - (flag == - M_PASTE) ? "PASTE" : ((flag == - M_INSERT) ? "INSERT" : - "UNKNOWN"), flag); +/* appended item will be in L[0] in whole */ +static void balance_leaf_paste_left_whole(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; + struct item_head *pasted; + int ret; + + /* if we paste into first item of S[0] and it is left mergable */ + if (!tb->item_pos && + op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) { + /* + * then increment pos_in_item by the size of the + * last item in L[0] + */ + pasted = item_head(tb->L[0], n - 1); + if (is_direntry_le_ih(pasted)) + tb->pos_in_item += ih_entry_count(pasted); + else + tb->pos_in_item += ih_item_len(pasted); } - /* the rule is that no shifting occurs unless by shifting a node can be freed */ - n = B_NR_ITEMS(tbS0); - if (tb->lnum[0]) { /* L[0] takes part in balancing */ - if (tb->lnum[0] == -1) { /* L[0] must be joined with S[0] */ - if (tb->rnum[0] == -1) { /* R[0] must be also joined with S[0] */ - if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) { - /* all contents of all the 3 buffers will be in L[0] */ - if (PATH_H_POSITION(tb->tb_path, 1) == 0 - && 1 < B_NR_ITEMS(tb->FR[0])) - replace_key(tb, tb->CFL[0], - tb->lkey[0], - tb->FR[0], 1); - - leaf_move_items(LEAF_FROM_S_TO_L, tb, n, - -1, NULL); - leaf_move_items(LEAF_FROM_R_TO_L, tb, - B_NR_ITEMS(tb->R[0]), - -1, NULL); - - reiserfs_invalidate_buffer(tb, tbS0); - reiserfs_invalidate_buffer(tb, - tb->R[0]); - - return 0; - } - /* all contents of all the 3 buffers will be in R[0] */ - leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, - NULL); - leaf_move_items(LEAF_FROM_L_TO_R, tb, - B_NR_ITEMS(tb->L[0]), -1, NULL); + /* + * Shift lnum[0] - 1 items in whole. + * Shift lbytes - 1 byte from item number lnum[0] + */ + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item, + tb->insert_size[0], body, tb->zeroes_num); + + /* if appended item is directory, paste entry */ + pasted = item_head(tb->L[0], n + tb->item_pos - ret); + if (is_direntry_le_ih(pasted)) + leaf_paste_entries(&bi, n + tb->item_pos - ret, + tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + /* + * if appended item is indirect item, put unformatted node + * into un list + */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); - /* right_delimiting_key is correct in R[0] */ - replace_key(tb, tb->CFR[0], tb->rkey[0], - tb->R[0], 0); + tb->insert_size[0] = 0; + tb->zeroes_num = 0; +} - reiserfs_invalidate_buffer(tb, tbS0); - reiserfs_invalidate_buffer(tb, tb->L[0]); +static void balance_leaf_paste_left(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + /* we must shift the part of the appended item */ + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) + balance_leaf_paste_left_shift(tb, ih, body); + else + balance_leaf_paste_left_whole(tb, ih, body); +} - return -1; - } +/* Shift lnum[0] items from S[0] to the left neighbor L[0] */ +static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag) +{ + if (tb->lnum[0] <= 0) + return; - RFALSE(tb->rnum[0] != 0, - "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); - /* all contents of L[0] and S[0] will be in L[0] */ - leaf_shift_left(tb, n, -1); + /* new item or it part falls to L[0], shift it too */ + if (tb->item_pos < tb->lnum[0]) { + BUG_ON(flag != M_INSERT && flag != M_PASTE); - reiserfs_invalidate_buffer(tb, tbS0); + if (flag == M_INSERT) + balance_leaf_insert_left(tb, ih, body); + else /* M_PASTE */ + balance_leaf_paste_left(tb, ih, body); + } else + /* new item doesn't fall into L[0] */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); +} + + +static void balance_leaf_insert_right(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + struct buffer_info bi; + int ret; - return 0; + /* new item or part of it doesn't fall into R[0] */ + if (n - tb->rnum[0] >= tb->item_pos) { + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + return; + } + + /* new item or its part falls to R[0] */ + + /* part of new item falls into R[0] */ + if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { + loff_t old_key_comp, old_len, r_zeroes_number; + const char *r_body; + int version, shift; + loff_t offset; + + leaf_shift_right(tb, tb->rnum[0] - 1, -1); + + version = ih_version(ih); + + /* Remember key component and item length */ + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* + * Calculate key component and item length to insert + * into R[0] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift); + set_le_ih_k_offset(ih, offset); + put_ih_item_len(ih, tb->rbytes); + + /* Insert part of the item into R[0] */ + buffer_info_init_right(tb, &bi); + if ((old_len - tb->rbytes) > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + (old_len - tb->rbytes) - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - + (old_len - tb->rbytes); + tb->zeroes_num -= r_zeroes_number; } - /* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */ - - RFALSE((tb->lnum[0] + tb->rnum[0] < n) || - (tb->lnum[0] + tb->rnum[0] > n + 1), - "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent", - tb->rnum[0], tb->lnum[0], n); - RFALSE((tb->lnum[0] + tb->rnum[0] == n) && - (tb->lbytes != -1 || tb->rbytes != -1), - "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split", - tb->rbytes, tb->lbytes); - RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) && - (tb->lbytes < 1 || tb->rbytes != -1), - "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split", - tb->rbytes, tb->lbytes); - leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number); + + /* Replace right delimiting key by first key in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + /* + * Calculate key component and item length to + * insert into S[0] + */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, old_len - tb->rbytes); + + tb->insert_size[0] -= tb->rbytes; + + } else { + /* whole new item falls into R[0] */ + + /* Shift rnum[0]-1 items to R[0] */ + ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); + + /* Insert new item into R[0] */ + buffer_info_init_right(tb, &bi); + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1, + ih, body, tb->zeroes_num); + + if (tb->item_pos - n + tb->rnum[0] - 1 == 0) + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + tb->zeroes_num = tb->insert_size[0] = 0; + } +} + + +static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + int entry_count; + + RFALSE(tb->zeroes_num, + "PAP-12145: invalid parameter in case of a directory"); + entry_count = ih_entry_count(item_head(tbS0, tb->item_pos)); + + /* new directory entry falls into R[0] */ + if (entry_count - tb->rbytes < tb->pos_in_item) { + int paste_entry_position; + + RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0], + "PAP-12150: no enough of entries to shift to R[0]: " + "rbytes=%d, entry_count=%d", tb->rbytes, entry_count); + + /* + * Shift rnum[0]-1 items in whole. + * Shift rbytes-1 directory entries from directory + * item number rnum[0] + */ + leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1); + + /* Paste given directory entry to directory item */ + paste_entry_position = tb->pos_in_item - entry_count + + tb->rbytes - 1; + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer(&bi, 0, paste_entry_position, + tb->insert_size[0], body, tb->zeroes_num); + + /* paste entry */ + leaf_paste_entries(&bi, 0, paste_entry_position, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + + /* change delimiting keys */ + if (paste_entry_position == 0) + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + tb->insert_size[0] = 0; + tb->pos_in_item++; + } else { + /* new directory entry doesn't fall into R[0] */ leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + } +} - reiserfs_invalidate_buffer(tb, tbS0); +static void balance_leaf_paste_right_shift(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n_shift, n_rem, r_zeroes_number, version; + unsigned long temp_rem; + const char *r_body; + struct buffer_info bi; - return 0; + /* we append to directory item */ + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { + balance_leaf_paste_right_shift_dirent(tb, ih, body); + return; } - if (tb->rnum[0] == -1) { - /* all contents of R[0] and S[0] will be in R[0] */ - leaf_shift_right(tb, n, -1); - reiserfs_invalidate_buffer(tb, tbS0); - return 0; + /* regular object */ + + /* + * Calculate number of bytes which must be shifted + * from appended item + */ + n_shift = tb->rbytes - tb->insert_size[0]; + if (n_shift < 0) + n_shift = 0; + + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)), + "PAP-12155: invalid position to paste. ih_item_len=%d, " + "pos_in_item=%d", tb->pos_in_item, + ih_item_len(item_head(tbS0, tb->item_pos))); + + leaf_shift_right(tb, tb->rnum[0], n_shift); + + /* + * Calculate number of bytes which must remain in body + * after appending to R[0] + */ + n_rem = tb->insert_size[0] - tb->rbytes; + if (n_rem < 0) + n_rem = 0; + + temp_rem = n_rem; + + version = ih_version(item_head(tb->R[0], 0)); + + if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) { + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + temp_rem = n_rem << shift; } - RFALSE(tb->rnum[0], - "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); - return 0; + add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem); + add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]), + temp_rem); + + do_balance_mark_internal_dirty(tb, tb->CFR[0], 0); + + /* Append part of body into R[0] */ + buffer_info_init_right(tb, &bi); + if (n_rem > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + n_rem - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - n_rem; + tb->zeroes_num -= r_zeroes_number; + } + + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, + r_body, r_zeroes_number); + + if (is_indirect_le_ih(item_head(tb->R[0], 0))) + set_ih_free_space(item_head(tb->R[0], 0), 0); + + tb->insert_size[0] = n_rem; + if (!n_rem) + tb->pos_in_item++; } -static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item header of inserted item (this is on little endian) */ - const char *body, /* body of inserted item or bytes to paste */ - int flag, /* i - insert, d - delete, c - cut, p - paste - (see comment to do_balance) */ - struct item_head *insert_key, /* in our processing of one level we sometimes determine what - must be inserted into the next higher level. This insertion - consists of a key or two keys and their corresponding - pointers */ - struct buffer_head **insert_ptr /* inserted node-ptrs for the next level */ - ) +static void balance_leaf_paste_right_whole(struct tree_balance *tb, + struct item_head *ih, const char *body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int item_pos = PATH_LAST_POSITION(tb->tb_path); /* index into the array of item headers in S[0] - of the affected item */ + int n = B_NR_ITEMS(tbS0); + struct item_head *pasted; struct buffer_info bi; - struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */ - int snum[2]; /* number of items that will be placed - into S_new (includes partially shifted - items) */ - int sbytes[2]; /* if an item is partially shifted into S_new then - if it is a directory item - it is the number of entries from the item that are shifted into S_new - else - it is the number of bytes from the item that are shifted into S_new - */ - int n, i; - int ret_val; - int pos_in_item; - int zeros_num; - PROC_INFO_INC(tb->tb_sb, balance_at[0]); + buffer_info_init_right(tb, &bi); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + + /* append item in R[0] */ + if (tb->pos_in_item >= 0) { + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0], + tb->pos_in_item, tb->insert_size[0], body, + tb->zeroes_num); + } - /* Make balance in case insert_size[0] < 0 */ - if (tb->insert_size[0] < 0) - return balance_leaf_when_delete(tb, flag); + /* paste new entry, if item is directory item */ + pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]); + if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) { + leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0], + tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); - zeros_num = 0; - if (flag == M_INSERT && !body) - zeros_num = ih_item_len(ih); + if (!tb->pos_in_item) { - pos_in_item = tb->tb_path->pos_in_item; - /* for indirect item pos_in_item is measured in unformatted node - pointers. Recalculate to bytes */ - if (flag != M_INSERT - && is_indirect_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) - pos_in_item *= UNFM_P_SIZE; - - if (tb->lnum[0] > 0) { - /* Shift lnum[0] items from S[0] to the left neighbor L[0] */ - if (item_pos < tb->lnum[0]) { - /* new item or it part falls to L[0], shift it too */ - n = B_NR_ITEMS(tb->L[0]); - - switch (flag) { - case M_INSERT: /* insert item into L[0] */ - - if (item_pos == tb->lnum[0] - 1 - && tb->lbytes != -1) { - /* part of new item falls into L[0] */ - int new_item_len; - int version; - - ret_val = - leaf_shift_left(tb, tb->lnum[0] - 1, - -1); - - /* Calculate item length to insert to S[0] */ - new_item_len = - ih_item_len(ih) - tb->lbytes; - /* Calculate and check item length to insert to L[0] */ - put_ih_item_len(ih, - ih_item_len(ih) - - new_item_len); - - RFALSE(ih_item_len(ih) <= 0, - "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d", - ih_item_len(ih)); - - /* Insert new item into L[0] */ - buffer_info_init_left(tb, &bi); - leaf_insert_into_buf(&bi, - n + item_pos - - ret_val, ih, body, - zeros_num > - ih_item_len(ih) ? - ih_item_len(ih) : - zeros_num); - - version = ih_version(ih); - - /* Calculate key component, item length and body to insert into S[0] */ - set_le_ih_k_offset(ih, - le_ih_k_offset(ih) + - (tb-> - lbytes << - (is_indirect_le_ih - (ih) ? tb->tb_sb-> - s_blocksize_bits - - UNFM_P_SHIFT : - 0))); - - put_ih_item_len(ih, new_item_len); - if (tb->lbytes > zeros_num) { - body += - (tb->lbytes - zeros_num); - zeros_num = 0; - } else - zeros_num -= tb->lbytes; - - RFALSE(ih_item_len(ih) <= 0, - "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d", - ih_item_len(ih)); - } else { - /* new item in whole falls into L[0] */ - /* Shift lnum[0]-1 items to L[0] */ - ret_val = - leaf_shift_left(tb, tb->lnum[0] - 1, - tb->lbytes); - /* Insert new item into L[0] */ - buffer_info_init_left(tb, &bi); - leaf_insert_into_buf(&bi, - n + item_pos - - ret_val, ih, body, - zeros_num); - tb->insert_size[0] = 0; - zeros_num = 0; - } - break; - - case M_PASTE: /* append item in L[0] */ - - if (item_pos == tb->lnum[0] - 1 - && tb->lbytes != -1) { - /* we must shift the part of the appended item */ - if (is_direntry_le_ih - (B_N_PITEM_HEAD(tbS0, item_pos))) { - - RFALSE(zeros_num, - "PAP-12090: invalid parameter in case of a directory"); - /* directory item */ - if (tb->lbytes > pos_in_item) { - /* new directory entry falls into L[0] */ - struct item_head - *pasted; - int l_pos_in_item = - pos_in_item; - - /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */ - ret_val = - leaf_shift_left(tb, - tb-> - lnum - [0], - tb-> - lbytes - - - 1); - if (ret_val - && !item_pos) { - pasted = - B_N_PITEM_HEAD - (tb->L[0], - B_NR_ITEMS - (tb-> - L[0]) - - 1); - l_pos_in_item += - I_ENTRY_COUNT - (pasted) - - (tb-> - lbytes - - 1); - } - - /* Append given directory entry to directory item */ - buffer_info_init_left(tb, &bi); - leaf_paste_in_buffer - (&bi, - n + item_pos - - ret_val, - l_pos_in_item, - tb->insert_size[0], - body, zeros_num); - - /* previous string prepared space for pasting new entry, following string pastes this entry */ - - /* when we have merge directory item, pos_in_item has been changed too */ - - /* paste new directory entry. 1 is entry number */ - leaf_paste_entries(&bi, - n + - item_pos - - - ret_val, - l_pos_in_item, - 1, - (struct - reiserfs_de_head - *) - body, - body - + - DEH_SIZE, - tb-> - insert_size - [0] - ); - tb->insert_size[0] = 0; - } else { - /* new directory item doesn't fall into L[0] */ - /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */ - leaf_shift_left(tb, - tb-> - lnum[0], - tb-> - lbytes); - } - /* Calculate new position to append in item body */ - pos_in_item -= tb->lbytes; - } else { - /* regular object */ - RFALSE(tb->lbytes <= 0, - "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", - tb->lbytes); - RFALSE(pos_in_item != - ih_item_len - (B_N_PITEM_HEAD - (tbS0, item_pos)), - "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d", - ih_item_len - (B_N_PITEM_HEAD - (tbS0, item_pos)), - pos_in_item); - - if (tb->lbytes >= pos_in_item) { - /* appended item will be in L[0] in whole */ - int l_n; - - /* this bytes number must be appended to the last item of L[h] */ - l_n = - tb->lbytes - - pos_in_item; - - /* Calculate new insert_size[0] */ - tb->insert_size[0] -= - l_n; - - RFALSE(tb-> - insert_size[0] <= - 0, - "PAP-12105: there is nothing to paste into L[0]. insert_size=%d", - tb-> - insert_size[0]); - ret_val = - leaf_shift_left(tb, - tb-> - lnum - [0], - ih_item_len - (B_N_PITEM_HEAD - (tbS0, - item_pos))); - /* Append to body of item in L[0] */ - buffer_info_init_left(tb, &bi); - leaf_paste_in_buffer - (&bi, - n + item_pos - - ret_val, - ih_item_len - (B_N_PITEM_HEAD - (tb->L[0], - n + item_pos - - ret_val)), l_n, - body, - zeros_num > - l_n ? l_n : - zeros_num); - /* 0-th item in S0 can be only of DIRECT type when l_n != 0 */ - { - int version; - int temp_l = - l_n; - - RFALSE - (ih_item_len - (B_N_PITEM_HEAD - (tbS0, - 0)), - "PAP-12106: item length must be 0"); - RFALSE - (comp_short_le_keys - (B_N_PKEY - (tbS0, 0), - B_N_PKEY - (tb->L[0], - n + - item_pos - - - ret_val)), - "PAP-12107: items must be of the same file"); - if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val))) { - temp_l = - l_n - << - (tb-> - tb_sb-> - s_blocksize_bits - - - UNFM_P_SHIFT); - } - /* update key of first item in S0 */ - version = - ih_version - (B_N_PITEM_HEAD - (tbS0, 0)); - set_le_key_k_offset - (version, - B_N_PKEY - (tbS0, 0), - le_key_k_offset - (version, - B_N_PKEY - (tbS0, - 0)) + - temp_l); - /* update left delimiting key */ - set_le_key_k_offset - (version, - B_N_PDELIM_KEY - (tb-> - CFL[0], - tb-> - lkey[0]), - le_key_k_offset - (version, - B_N_PDELIM_KEY - (tb-> - CFL[0], - tb-> - lkey[0])) - + temp_l); - } - - /* Calculate new body, position in item and insert_size[0] */ - if (l_n > zeros_num) { - body += - (l_n - - zeros_num); - zeros_num = 0; - } else - zeros_num -= - l_n; - pos_in_item = 0; - - RFALSE - (comp_short_le_keys - (B_N_PKEY(tbS0, 0), - B_N_PKEY(tb->L[0], - B_NR_ITEMS - (tb-> - L[0]) - - 1)) - || - !op_is_left_mergeable - (B_N_PKEY(tbS0, 0), - tbS0->b_size) - || - !op_is_left_mergeable - (B_N_PDELIM_KEY - (tb->CFL[0], - tb->lkey[0]), - tbS0->b_size), - "PAP-12120: item must be merge-able with left neighboring item"); - } else { /* only part of the appended item will be in L[0] */ - - /* Calculate position in item for append in S[0] */ - pos_in_item -= - tb->lbytes; - - RFALSE(pos_in_item <= 0, - "PAP-12125: no place for paste. pos_in_item=%d", - pos_in_item); - - /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ - leaf_shift_left(tb, - tb-> - lnum[0], - tb-> - lbytes); - } - } - } else { /* appended item will be in L[0] in whole */ - - struct item_head *pasted; - - if (!item_pos && op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)) { /* if we paste into first item of S[0] and it is left mergable */ - /* then increment pos_in_item by the size of the last item in L[0] */ - pasted = - B_N_PITEM_HEAD(tb->L[0], - n - 1); - if (is_direntry_le_ih(pasted)) - pos_in_item += - ih_entry_count - (pasted); - else - pos_in_item += - ih_item_len(pasted); - } - - /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ - ret_val = - leaf_shift_left(tb, tb->lnum[0], - tb->lbytes); - /* Append to body of item in L[0] */ - buffer_info_init_left(tb, &bi); - leaf_paste_in_buffer(&bi, - n + item_pos - - ret_val, - pos_in_item, - tb->insert_size[0], - body, zeros_num); - - /* if appended item is directory, paste entry */ - pasted = - B_N_PITEM_HEAD(tb->L[0], - n + item_pos - - ret_val); - if (is_direntry_le_ih(pasted)) - leaf_paste_entries(&bi, - n + - item_pos - - ret_val, - pos_in_item, - 1, - (struct - reiserfs_de_head - *)body, - body + - DEH_SIZE, - tb-> - insert_size - [0] - ); - /* if appended item is indirect item, put unformatted node into un list */ - if (is_indirect_le_ih(pasted)) - set_ih_free_space(pasted, 0); - tb->insert_size[0] = 0; - zeros_num = 0; - } - break; - default: /* cases d and t */ - reiserfs_panic(tb->tb_sb, "PAP-12130", - "lnum > 0: unexpected mode: " - " %s(%d)", - (flag == - M_DELETE) ? "DELETE" : ((flag == - M_CUT) - ? "CUT" - : - "UNKNOWN"), - flag); - } - } else { - /* new item doesn't fall into L[0] */ - leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + RFALSE(tb->item_pos - n + tb->rnum[0], + "PAP-12165: directory item must be first " + "item of node when pasting is in 0th position"); + + /* update delimiting keys */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); } } - /* tb->lnum[0] > 0 */ - /* Calculate new item position */ - item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0)); - - if (tb->rnum[0] > 0) { - /* shift rnum[0] items from S[0] to the right neighbor R[0] */ - n = B_NR_ITEMS(tbS0); - switch (flag) { - - case M_INSERT: /* insert item */ - if (n - tb->rnum[0] < item_pos) { /* new item or its part falls to R[0] */ - if (item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { /* part of new item falls into R[0] */ - loff_t old_key_comp, old_len, - r_zeros_number; - const char *r_body; - int version; - loff_t offset; - - leaf_shift_right(tb, tb->rnum[0] - 1, - -1); - - version = ih_version(ih); - /* Remember key component and item length */ - old_key_comp = le_ih_k_offset(ih); - old_len = ih_item_len(ih); - - /* Calculate key component and item length to insert into R[0] */ - offset = - le_ih_k_offset(ih) + - ((old_len - - tb-> - rbytes) << (is_indirect_le_ih(ih) - ? tb->tb_sb-> - s_blocksize_bits - - UNFM_P_SHIFT : 0)); - set_le_ih_k_offset(ih, offset); - put_ih_item_len(ih, tb->rbytes); - /* Insert part of the item into R[0] */ - buffer_info_init_right(tb, &bi); - if ((old_len - tb->rbytes) > zeros_num) { - r_zeros_number = 0; - r_body = - body + (old_len - - tb->rbytes) - - zeros_num; - } else { - r_body = body; - r_zeros_number = - zeros_num - (old_len - - tb->rbytes); - zeros_num -= r_zeros_number; - } - - leaf_insert_into_buf(&bi, 0, ih, r_body, - r_zeros_number); - - /* Replace right delimiting key by first key in R[0] */ - replace_key(tb, tb->CFR[0], tb->rkey[0], - tb->R[0], 0); - - /* Calculate key component and item length to insert into S[0] */ - set_le_ih_k_offset(ih, old_key_comp); - put_ih_item_len(ih, - old_len - tb->rbytes); - - tb->insert_size[0] -= tb->rbytes; - - } else { /* whole new item falls into R[0] */ - - /* Shift rnum[0]-1 items to R[0] */ - ret_val = - leaf_shift_right(tb, - tb->rnum[0] - 1, - tb->rbytes); - /* Insert new item into R[0] */ - buffer_info_init_right(tb, &bi); - leaf_insert_into_buf(&bi, - item_pos - n + - tb->rnum[0] - 1, - ih, body, - zeros_num); - - if (item_pos - n + tb->rnum[0] - 1 == 0) { - replace_key(tb, tb->CFR[0], - tb->rkey[0], - tb->R[0], 0); - - } - zeros_num = tb->insert_size[0] = 0; - } - } else { /* new item or part of it doesn't fall into R[0] */ - - leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - } - break; + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + tb->zeroes_num = tb->insert_size[0] = 0; +} - case M_PASTE: /* append item */ - - if (n - tb->rnum[0] <= item_pos) { /* pasted item or part of it falls to R[0] */ - if (item_pos == n - tb->rnum[0] && tb->rbytes != -1) { /* we must shift the part of the appended item */ - if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) { /* we append to directory item */ - int entry_count; - - RFALSE(zeros_num, - "PAP-12145: invalid parameter in case of a directory"); - entry_count = - I_ENTRY_COUNT(B_N_PITEM_HEAD - (tbS0, - item_pos)); - if (entry_count - tb->rbytes < - pos_in_item) - /* new directory entry falls into R[0] */ - { - int paste_entry_position; - - RFALSE(tb->rbytes - 1 >= - entry_count - || !tb-> - insert_size[0], - "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d", - tb->rbytes, - entry_count); - /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */ - leaf_shift_right(tb, - tb-> - rnum - [0], - tb-> - rbytes - - 1); - /* Paste given directory entry to directory item */ - paste_entry_position = - pos_in_item - - entry_count + - tb->rbytes - 1; - buffer_info_init_right(tb, &bi); - leaf_paste_in_buffer - (&bi, 0, - paste_entry_position, - tb->insert_size[0], - body, zeros_num); - /* paste entry */ - leaf_paste_entries(&bi, - 0, - paste_entry_position, - 1, - (struct - reiserfs_de_head - *) - body, - body - + - DEH_SIZE, - tb-> - insert_size - [0] - ); - - if (paste_entry_position - == 0) { - /* change delimiting keys */ - replace_key(tb, - tb-> - CFR - [0], - tb-> - rkey - [0], - tb-> - R - [0], - 0); - } - - tb->insert_size[0] = 0; - pos_in_item++; - } else { /* new directory entry doesn't fall into R[0] */ - - leaf_shift_right(tb, - tb-> - rnum - [0], - tb-> - rbytes); - } - } else { /* regular object */ - - int n_shift, n_rem, - r_zeros_number; - const char *r_body; - - /* Calculate number of bytes which must be shifted from appended item */ - if ((n_shift = - tb->rbytes - - tb->insert_size[0]) < 0) - n_shift = 0; - - RFALSE(pos_in_item != - ih_item_len - (B_N_PITEM_HEAD - (tbS0, item_pos)), - "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d", - pos_in_item, - ih_item_len - (B_N_PITEM_HEAD - (tbS0, item_pos))); - - leaf_shift_right(tb, - tb->rnum[0], - n_shift); - /* Calculate number of bytes which must remain in body after appending to R[0] */ - if ((n_rem = - tb->insert_size[0] - - tb->rbytes) < 0) - n_rem = 0; - - { - int version; - unsigned long temp_rem = - n_rem; - - version = - ih_version - (B_N_PITEM_HEAD - (tb->R[0], 0)); - if (is_indirect_le_key - (version, - B_N_PKEY(tb->R[0], - 0))) { - temp_rem = - n_rem << - (tb->tb_sb-> - s_blocksize_bits - - - UNFM_P_SHIFT); - } - set_le_key_k_offset - (version, - B_N_PKEY(tb->R[0], - 0), - le_key_k_offset - (version, - B_N_PKEY(tb->R[0], - 0)) + - temp_rem); - set_le_key_k_offset - (version, - B_N_PDELIM_KEY(tb-> - CFR - [0], - tb-> - rkey - [0]), - le_key_k_offset - (version, - B_N_PDELIM_KEY - (tb->CFR[0], - tb->rkey[0])) + - temp_rem); - } -/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem; - k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/ - do_balance_mark_internal_dirty - (tb, tb->CFR[0], 0); - - /* Append part of body into R[0] */ - buffer_info_init_right(tb, &bi); - if (n_rem > zeros_num) { - r_zeros_number = 0; - r_body = - body + n_rem - - zeros_num; - } else { - r_body = body; - r_zeros_number = - zeros_num - n_rem; - zeros_num -= - r_zeros_number; - } - - leaf_paste_in_buffer(&bi, 0, - n_shift, - tb-> - insert_size - [0] - - n_rem, - r_body, - r_zeros_number); - - if (is_indirect_le_ih - (B_N_PITEM_HEAD - (tb->R[0], 0))) { -#if 0 - RFALSE(n_rem, - "PAP-12160: paste more than one unformatted node pointer"); -#endif - set_ih_free_space - (B_N_PITEM_HEAD - (tb->R[0], 0), 0); - } - tb->insert_size[0] = n_rem; - if (!n_rem) - pos_in_item++; - } - } else { /* pasted item in whole falls into R[0] */ - - struct item_head *pasted; - - ret_val = - leaf_shift_right(tb, tb->rnum[0], - tb->rbytes); - /* append item in R[0] */ - if (pos_in_item >= 0) { - buffer_info_init_right(tb, &bi); - leaf_paste_in_buffer(&bi, - item_pos - - n + - tb-> - rnum[0], - pos_in_item, - tb-> - insert_size - [0], body, - zeros_num); - } - - /* paste new entry, if item is directory item */ - pasted = - B_N_PITEM_HEAD(tb->R[0], - item_pos - n + - tb->rnum[0]); - if (is_direntry_le_ih(pasted) - && pos_in_item >= 0) { - leaf_paste_entries(&bi, - item_pos - - n + - tb->rnum[0], - pos_in_item, - 1, - (struct - reiserfs_de_head - *)body, - body + - DEH_SIZE, - tb-> - insert_size - [0] - ); - if (!pos_in_item) { - - RFALSE(item_pos - n + - tb->rnum[0], - "PAP-12165: directory item must be first item of node when pasting is in 0th position"); - - /* update delimiting keys */ - replace_key(tb, - tb->CFR[0], - tb->rkey[0], - tb->R[0], - 0); - } - } - - if (is_indirect_le_ih(pasted)) - set_ih_free_space(pasted, 0); - zeros_num = tb->insert_size[0] = 0; - } - } else { /* new item doesn't fall into R[0] */ - - leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - } - break; - default: /* cases d and t */ - reiserfs_panic(tb->tb_sb, "PAP-12175", - "rnum > 0: unexpected mode: %s(%d)", - (flag == - M_DELETE) ? "DELETE" : ((flag == - M_CUT) ? "CUT" - : "UNKNOWN"), - flag); - } +static void balance_leaf_paste_right(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + /* new item doesn't fall into R[0] */ + if (n - tb->rnum[0] > tb->item_pos) { + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + return; } - /* tb->rnum[0] > 0 */ - RFALSE(tb->blknum[0] > 3, - "PAP-12180: blknum can not be %d. It must be <= 3", - tb->blknum[0]); - RFALSE(tb->blknum[0] < 0, - "PAP-12185: blknum can not be %d. It must be >= 0", - tb->blknum[0]); + /* pasted item or part of it falls to R[0] */ - /* if while adding to a node we discover that it is possible to split - it in two, and merge the left part into the left neighbor and the - right part into the right neighbor, eliminating the node */ - if (tb->blknum[0] == 0) { /* node S[0] is empty now */ + if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1) + /* we must shift the part of the appended item */ + balance_leaf_paste_right_shift(tb, ih, body); + else + /* pasted item in whole falls into R[0] */ + balance_leaf_paste_right_whole(tb, ih, body); +} - RFALSE(!tb->lnum[0] || !tb->rnum[0], - "PAP-12190: lnum and rnum must not be zero"); - /* if insertion was done before 0-th position in R[0], right - delimiting key of the tb->L[0]'s and left delimiting key are - not set correctly */ - if (tb->CFL[0]) { - if (!tb->CFR[0]) - reiserfs_panic(tb->tb_sb, "vs-12195", - "CFR not initialized"); - copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), - B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])); - do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); +/* shift rnum[0] items from S[0] to the right neighbor R[0] */ +static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag) +{ + if (tb->rnum[0] <= 0) + return; + + BUG_ON(flag != M_INSERT && flag != M_PASTE); + + if (flag == M_INSERT) + balance_leaf_insert_right(tb, ih, body); + else /* M_PASTE */ + balance_leaf_paste_right(tb, ih, body); +} + +static void balance_leaf_new_nodes_insert(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + struct buffer_info bi; + int shift; + + /* new item or it part don't falls into S_new[i] */ + if (n - tb->snum[i] >= tb->item_pos) { + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i], tb->sbytes[i], tb->S_new[i]); + return; + } + + /* new item or it's part falls to first new node S_new[i] */ + + /* part of new item falls into S_new[i] */ + if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) { + int old_key_comp, old_len, r_zeroes_number; + const char *r_body; + int version; + + /* Move snum[i]-1 items from S[0] to S_new[i] */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1, + tb->S_new[i]); + + /* Remember key component and item length */ + version = ih_version(ih); + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* + * Calculate key component and item length to insert + * into S_new[i] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + ((old_len - tb->sbytes[i]) << shift)); + + put_ih_item_len(ih, tb->sbytes[i]); + + /* Insert part of the item into S_new[i] before 0-th item */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + + if ((old_len - tb->sbytes[i]) > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + (old_len - tb->sbytes[i]) - + tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - (old_len - + tb->sbytes[i]); + tb->zeroes_num -= r_zeroes_number; } - reiserfs_invalidate_buffer(tb, tbS0); - return 0; + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number); + + /* + * Calculate key component and item length to + * insert into S[i] + */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, old_len - tb->sbytes[i]); + tb->insert_size[0] -= tb->sbytes[i]; + } else { + /* whole new item falls into S_new[i] */ + + /* + * Shift snum[0] - 1 items to S_new[i] + * (sbytes[i] of split item) + */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]); + + /* Insert new item into S_new[i] */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1, + ih, body, tb->zeroes_num); + + tb->zeroes_num = tb->insert_size[0] = 0; } +} - /* Fill new nodes that appear in place of S[0] */ +/* we append to directory item */ +static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *aux_ih = item_head(tbS0, tb->item_pos); + int entry_count = ih_entry_count(aux_ih); + struct buffer_info bi; + + if (entry_count - tb->sbytes[i] < tb->pos_in_item && + tb->pos_in_item <= entry_count) { + /* new directory entry falls into S_new[i] */ + + RFALSE(!tb->insert_size[0], + "PAP-12215: insert_size is already 0"); + RFALSE(tb->sbytes[i] - 1 >= entry_count, + "PAP-12220: there are no so much entries (%d), only %d", + tb->sbytes[i] - 1, entry_count); + + /* + * Shift snum[i]-1 items in whole. + * Shift sbytes[i] directory entries + * from directory item number snum[i] + */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i] - 1, tb->S_new[i]); + + /* + * Paste given directory entry to + * directory item + */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count + + tb->sbytes[i] - 1, tb->insert_size[0], + body, tb->zeroes_num); + + /* paste new directory entry */ + leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count + + tb->sbytes[i] - 1, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + + tb->insert_size[0] = 0; + tb->pos_in_item++; + } else { + /* new directory entry doesn't fall into S_new[i] */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i], tb->S_new[i]); + } + +} - /* I am told that this copying is because we need an array to enable - the looping code. -Hans */ - snum[0] = tb->s1num, snum[1] = tb->s2num; - sbytes[0] = tb->s1bytes; - sbytes[1] = tb->s2bytes; +static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *aux_ih = item_head(tbS0, tb->item_pos); + int n_shift, n_rem, r_zeroes_number, shift; + const char *r_body; + struct item_head *tmp; + struct buffer_info bi; + + RFALSE(ih, "PAP-12210: ih must be 0"); + + if (is_direntry_le_ih(aux_ih)) { + balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key, + insert_ptr, i); + return; + } + + /* regular object */ + + + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) || + tb->insert_size[0] <= 0, + "PAP-12225: item too short or insert_size <= 0"); + + /* + * Calculate number of bytes which must be shifted from appended item + */ + n_shift = tb->sbytes[i] - tb->insert_size[0]; + if (n_shift < 0) + n_shift = 0; + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift, + tb->S_new[i]); + + /* + * Calculate number of bytes which must remain in body after + * append to S_new[i] + */ + n_rem = tb->insert_size[0] - tb->sbytes[i]; + if (n_rem < 0) + n_rem = 0; + + /* Append part of body into S_new[0] */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + if (n_rem > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + n_rem - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - n_rem; + tb->zeroes_num -= r_zeroes_number; + } + + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, + r_body, r_zeroes_number); + + tmp = item_head(tb->S_new[i], 0); + shift = 0; + if (is_indirect_le_ih(tmp)) { + set_ih_free_space(tmp, 0); + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + } + add_le_ih_k_offset(tmp, n_rem << shift); + + tb->insert_size[0] = n_rem; + if (!n_rem) + tb->pos_in_item++; +} + +static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) + +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + int leaf_mi; + struct item_head *pasted; + struct buffer_info bi; + +#ifdef CONFIG_REISERFS_CHECK + struct item_head *ih_check = item_head(tbS0, tb->item_pos); + + if (!is_direntry_le_ih(ih_check) && + (tb->pos_in_item != ih_item_len(ih_check) || + tb->insert_size[0] <= 0)) + reiserfs_panic(tb->tb_sb, + "PAP-12235", + "pos_in_item must be equal to ih_item_len"); +#endif + + leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i], tb->S_new[i]); + + RFALSE(leaf_mi, + "PAP-12240: unexpected value returned by leaf_move_items (%d)", + leaf_mi); + + /* paste into item */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i], + tb->pos_in_item, tb->insert_size[0], + body, tb->zeroes_num); + + pasted = item_head(tb->S_new[i], tb->item_pos - n + + tb->snum[i]); + if (is_direntry_le_ih(pasted)) + leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i], + tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + /* if we paste to indirect item update ih_free_space */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + + tb->zeroes_num = tb->insert_size[0] = 0; + +} +static void balance_leaf_new_nodes_paste(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + + /* pasted item doesn't fall into S_new[i] */ + if (n - tb->snum[i] > tb->item_pos) { + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i], tb->sbytes[i], tb->S_new[i]); + return; + } + + /* pasted item or part if it falls to S_new[i] */ + + if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1) + /* we must shift part of the appended item */ + balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key, + insert_ptr, i); + else + /* item falls wholly into S_new[i] */ + balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key, + insert_ptr, i); +} + +/* Fill new nodes that appear in place of S[0] */ +static void balance_leaf_new_nodes(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int flag) +{ + int i; for (i = tb->blknum[0] - 2; i >= 0; i--) { + BUG_ON(flag != M_INSERT && flag != M_PASTE); - RFALSE(!snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i, - snum[i]); + RFALSE(!tb->snum[i], + "PAP-12200: snum[%d] == %d. Must be > 0", i, + tb->snum[i]); /* here we shift from S to S_new nodes */ - S_new[i] = get_FEB(tb); + tb->S_new[i] = get_FEB(tb); /* initialized block type and tree level */ - set_blkh_level(B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL); - - n = B_NR_ITEMS(tbS0); - - switch (flag) { - case M_INSERT: /* insert item */ - - if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */ - if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */ - int old_key_comp, old_len, - r_zeros_number; - const char *r_body; - int version; - - /* Move snum[i]-1 items from S[0] to S_new[i] */ - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - snum[i] - 1, -1, - S_new[i]); - /* Remember key component and item length */ - version = ih_version(ih); - old_key_comp = le_ih_k_offset(ih); - old_len = ih_item_len(ih); - - /* Calculate key component and item length to insert into S_new[i] */ - set_le_ih_k_offset(ih, - le_ih_k_offset(ih) + - ((old_len - - sbytes[i]) << - (is_indirect_le_ih - (ih) ? tb->tb_sb-> - s_blocksize_bits - - UNFM_P_SHIFT : - 0))); - - put_ih_item_len(ih, sbytes[i]); - - /* Insert part of the item into S_new[i] before 0-th item */ - buffer_info_init_bh(tb, &bi, S_new[i]); - - if ((old_len - sbytes[i]) > zeros_num) { - r_zeros_number = 0; - r_body = - body + (old_len - - sbytes[i]) - - zeros_num; - } else { - r_body = body; - r_zeros_number = - zeros_num - (old_len - - sbytes[i]); - zeros_num -= r_zeros_number; - } - - leaf_insert_into_buf(&bi, 0, ih, r_body, - r_zeros_number); - - /* Calculate key component and item length to insert into S[i] */ - set_le_ih_k_offset(ih, old_key_comp); - put_ih_item_len(ih, - old_len - sbytes[i]); - tb->insert_size[0] -= sbytes[i]; - } else { /* whole new item falls into S_new[i] */ - - /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - snum[i] - 1, sbytes[i], - S_new[i]); - - /* Insert new item into S_new[i] */ - buffer_info_init_bh(tb, &bi, S_new[i]); - leaf_insert_into_buf(&bi, - item_pos - n + - snum[i] - 1, ih, - body, zeros_num); - - zeros_num = tb->insert_size[0] = 0; - } - } - - else { /* new item or it part don't falls into S_new[i] */ + set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL); + + if (flag == M_INSERT) + balance_leaf_new_nodes_insert(tb, ih, body, insert_key, + insert_ptr, i); + else /* M_PASTE */ + balance_leaf_new_nodes_paste(tb, ih, body, insert_key, + insert_ptr, i); + + memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE); + insert_ptr[i] = tb->S_new[i]; + + RFALSE(!buffer_journaled(tb->S_new[i]) + || buffer_journal_dirty(tb->S_new[i]) + || buffer_dirty(tb->S_new[i]), + "PAP-12247: S_new[%d] : (%b)", + i, tb->S_new[i]); + } +} - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - snum[i], sbytes[i], S_new[i]); - } - break; +static void balance_leaf_finish_node_insert(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + buffer_info_init_tbS0(tb, &bi); + leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num); - case M_PASTE: /* append item */ - - if (n - snum[i] <= item_pos) { /* pasted item or part if it falls to S_new[i] */ - if (item_pos == n - snum[i] && sbytes[i] != -1) { /* we must shift part of the appended item */ - struct item_head *aux_ih; - - RFALSE(ih, "PAP-12210: ih must be 0"); - - aux_ih = B_N_PITEM_HEAD(tbS0, item_pos); - if (is_direntry_le_ih(aux_ih)) { - /* we append to directory item */ - - int entry_count; - - entry_count = - ih_entry_count(aux_ih); - - if (entry_count - sbytes[i] < - pos_in_item - && pos_in_item <= - entry_count) { - /* new directory entry falls into S_new[i] */ - - RFALSE(!tb-> - insert_size[0], - "PAP-12215: insert_size is already 0"); - RFALSE(sbytes[i] - 1 >= - entry_count, - "PAP-12220: there are no so much entries (%d), only %d", - sbytes[i] - 1, - entry_count); - - /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */ - leaf_move_items - (LEAF_FROM_S_TO_SNEW, - tb, snum[i], - sbytes[i] - 1, - S_new[i]); - /* Paste given directory entry to directory item */ - buffer_info_init_bh(tb, &bi, S_new[i]); - leaf_paste_in_buffer - (&bi, 0, - pos_in_item - - entry_count + - sbytes[i] - 1, - tb->insert_size[0], - body, zeros_num); - /* paste new directory entry */ - leaf_paste_entries(&bi, - 0, - pos_in_item - - - entry_count - + - sbytes - [i] - - 1, 1, - (struct - reiserfs_de_head - *) - body, - body - + - DEH_SIZE, - tb-> - insert_size - [0] - ); - tb->insert_size[0] = 0; - pos_in_item++; - } else { /* new directory entry doesn't fall into S_new[i] */ - leaf_move_items - (LEAF_FROM_S_TO_SNEW, - tb, snum[i], - sbytes[i], - S_new[i]); - } - } else { /* regular object */ - - int n_shift, n_rem, - r_zeros_number; - const char *r_body; - - RFALSE(pos_in_item != - ih_item_len - (B_N_PITEM_HEAD - (tbS0, item_pos)) - || tb->insert_size[0] <= - 0, - "PAP-12225: item too short or insert_size <= 0"); - - /* Calculate number of bytes which must be shifted from appended item */ - n_shift = - sbytes[i] - - tb->insert_size[0]; - if (n_shift < 0) - n_shift = 0; - leaf_move_items - (LEAF_FROM_S_TO_SNEW, tb, - snum[i], n_shift, - S_new[i]); - - /* Calculate number of bytes which must remain in body after append to S_new[i] */ - n_rem = - tb->insert_size[0] - - sbytes[i]; - if (n_rem < 0) - n_rem = 0; - /* Append part of body into S_new[0] */ - buffer_info_init_bh(tb, &bi, S_new[i]); - if (n_rem > zeros_num) { - r_zeros_number = 0; - r_body = - body + n_rem - - zeros_num; - } else { - r_body = body; - r_zeros_number = - zeros_num - n_rem; - zeros_num -= - r_zeros_number; - } - - leaf_paste_in_buffer(&bi, 0, - n_shift, - tb-> - insert_size - [0] - - n_rem, - r_body, - r_zeros_number); - { - struct item_head *tmp; - - tmp = - B_N_PITEM_HEAD(S_new - [i], - 0); - if (is_indirect_le_ih - (tmp)) { - set_ih_free_space - (tmp, 0); - set_le_ih_k_offset - (tmp, - le_ih_k_offset - (tmp) + - (n_rem << - (tb-> - tb_sb-> - s_blocksize_bits - - - UNFM_P_SHIFT))); - } else { - set_le_ih_k_offset - (tmp, - le_ih_k_offset - (tmp) + - n_rem); - } - } - - tb->insert_size[0] = n_rem; - if (!n_rem) - pos_in_item++; - } - } else - /* item falls wholly into S_new[i] */ - { - int leaf_mi; - struct item_head *pasted; + /* If we insert the first key change the delimiting key */ + if (tb->item_pos == 0) { + if (tb->CFL[0]) /* can be 0 in reiserfsck */ + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); -#ifdef CONFIG_REISERFS_CHECK - struct item_head *ih_check = - B_N_PITEM_HEAD(tbS0, item_pos); - - if (!is_direntry_le_ih(ih_check) - && (pos_in_item != ih_item_len(ih_check) - || tb->insert_size[0] <= 0)) - reiserfs_panic(tb->tb_sb, - "PAP-12235", - "pos_in_item " - "must be equal " - "to ih_item_len"); -#endif /* CONFIG_REISERFS_CHECK */ - - leaf_mi = - leaf_move_items(LEAF_FROM_S_TO_SNEW, - tb, snum[i], - sbytes[i], - S_new[i]); - - RFALSE(leaf_mi, - "PAP-12240: unexpected value returned by leaf_move_items (%d)", - leaf_mi); - - /* paste into item */ - buffer_info_init_bh(tb, &bi, S_new[i]); - leaf_paste_in_buffer(&bi, - item_pos - n + - snum[i], - pos_in_item, - tb->insert_size[0], - body, zeros_num); - - pasted = - B_N_PITEM_HEAD(S_new[i], - item_pos - n + - snum[i]); - if (is_direntry_le_ih(pasted)) { - leaf_paste_entries(&bi, - item_pos - - n + snum[i], - pos_in_item, - 1, - (struct - reiserfs_de_head - *)body, - body + - DEH_SIZE, - tb-> - insert_size - [0] - ); - } - - /* if we paste to indirect item update ih_free_space */ - if (is_indirect_le_ih(pasted)) - set_ih_free_space(pasted, 0); - zeros_num = tb->insert_size[0] = 0; - } - } + } +} - else { /* pasted item doesn't fall into S_new[i] */ +static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *pasted = item_head(tbS0, tb->item_pos); + struct buffer_info bi; - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - snum[i], sbytes[i], S_new[i]); - } - break; - default: /* cases d and t */ - reiserfs_panic(tb->tb_sb, "PAP-12245", - "blknum > 2: unexpected mode: %s(%d)", - (flag == - M_DELETE) ? "DELETE" : ((flag == - M_CUT) ? "CUT" - : "UNKNOWN"), - flag); + if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) { + RFALSE(!tb->insert_size[0], + "PAP-12260: insert_size is 0 already"); + + /* prepare space */ + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item, + tb->insert_size[0], body, tb->zeroes_num); + + /* paste entry */ + leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + if (!tb->item_pos && !tb->pos_in_item) { + RFALSE(!tb->CFL[0] || !tb->L[0], + "PAP-12270: CFL[0]/L[0] must be specified"); + if (tb->CFL[0]) + replace_key(tb, tb->CFL[0], tb->lkey[0], + tbS0, 0); } - memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE); - insert_ptr[i] = S_new[i]; - - RFALSE(!buffer_journaled(S_new[i]) - || buffer_journal_dirty(S_new[i]) - || buffer_dirty(S_new[i]), "PAP-12247: S_new[%d] : (%b)", - i, S_new[i]); + tb->insert_size[0] = 0; } +} - /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the - affected item which remains in S */ - if (0 <= item_pos && item_pos < tb->s0num) { /* if we must insert or append into buffer S[0] */ +static void balance_leaf_finish_node_paste(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + struct item_head *pasted = item_head(tbS0, tb->item_pos); - switch (flag) { - case M_INSERT: /* insert item into S[0] */ - buffer_info_init_tbS0(tb, &bi); - leaf_insert_into_buf(&bi, item_pos, ih, body, - zeros_num); + /* when directory, may be new entry already pasted */ + if (is_direntry_le_ih(pasted)) { + balance_leaf_finish_node_paste_dirent(tb, ih, body); + return; + } - /* If we insert the first key change the delimiting key */ - if (item_pos == 0) { - if (tb->CFL[0]) /* can be 0 in reiserfsck */ - replace_key(tb, tb->CFL[0], tb->lkey[0], - tbS0, 0); + /* regular object */ - } - break; + if (tb->pos_in_item == ih_item_len(pasted)) { + RFALSE(tb->insert_size[0] <= 0, + "PAP-12275: insert size must not be %d", + tb->insert_size[0]); + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos, + tb->pos_in_item, tb->insert_size[0], body, + tb->zeroes_num); - case M_PASTE:{ /* append item in S[0] */ - struct item_head *pasted; - - pasted = B_N_PITEM_HEAD(tbS0, item_pos); - /* when directory, may be new entry already pasted */ - if (is_direntry_le_ih(pasted)) { - if (pos_in_item >= 0 && - pos_in_item <= - ih_entry_count(pasted)) { - - RFALSE(!tb->insert_size[0], - "PAP-12260: insert_size is 0 already"); - - /* prepare space */ - buffer_info_init_tbS0(tb, &bi); - leaf_paste_in_buffer(&bi, - item_pos, - pos_in_item, - tb-> - insert_size - [0], body, - zeros_num); - - /* paste entry */ - leaf_paste_entries(&bi, - item_pos, - pos_in_item, - 1, - (struct - reiserfs_de_head - *)body, - body + - DEH_SIZE, - tb-> - insert_size - [0] - ); - if (!item_pos && !pos_in_item) { - RFALSE(!tb->CFL[0] - || !tb->L[0], - "PAP-12270: CFL[0]/L[0] must be specified"); - if (tb->CFL[0]) { - replace_key(tb, - tb-> - CFL - [0], - tb-> - lkey - [0], - tbS0, - 0); - - } - } - tb->insert_size[0] = 0; - } - } else { /* regular object */ - if (pos_in_item == ih_item_len(pasted)) { - - RFALSE(tb->insert_size[0] <= 0, - "PAP-12275: insert size must not be %d", - tb->insert_size[0]); - buffer_info_init_tbS0(tb, &bi); - leaf_paste_in_buffer(&bi, - item_pos, - pos_in_item, - tb-> - insert_size - [0], body, - zeros_num); - - if (is_indirect_le_ih(pasted)) { -#if 0 - RFALSE(tb-> - insert_size[0] != - UNFM_P_SIZE, - "PAP-12280: insert_size for indirect item must be %d, not %d", - UNFM_P_SIZE, - tb-> - insert_size[0]); -#endif - set_ih_free_space - (pasted, 0); - } - tb->insert_size[0] = 0; - } + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + + tb->insert_size[0] = 0; + } #ifdef CONFIG_REISERFS_CHECK - else { - if (tb->insert_size[0]) { - print_cur_tb("12285"); - reiserfs_panic(tb-> - tb_sb, - "PAP-12285", - "insert_size " - "must be 0 " - "(%d)", - tb->insert_size[0]); - } - } -#endif /* CONFIG_REISERFS_CHECK */ - - } - } /* case M_PASTE: */ + else if (tb->insert_size[0]) { + print_cur_tb("12285"); + reiserfs_panic(tb->tb_sb, "PAP-12285", + "insert_size must be 0 (%d)", tb->insert_size[0]); + } +#endif +} + +/* + * if the affected item was not wholly shifted then we + * perform all necessary operations on that part or whole + * of the affected item which remains in S + */ +static void balance_leaf_finish_node(struct tree_balance *tb, + struct item_head *ih, + const char *body, int flag) +{ + /* if we must insert or append into buffer S[0] */ + if (0 <= tb->item_pos && tb->item_pos < tb->s0num) { + if (flag == M_INSERT) + balance_leaf_finish_node_insert(tb, ih, body); + else /* M_PASTE */ + balance_leaf_finish_node_paste(tb, ih, body); + } +} + +/** + * balance_leaf - reiserfs tree balancing algorithm + * @tb: tree balance state + * @ih: item header of inserted item (little endian) + * @body: body of inserted item or bytes to paste + * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance) + * passed back: + * @insert_key: key to insert new nodes + * @insert_ptr: array of nodes to insert at the next level + * + * In our processing of one level we sometimes determine what must be + * inserted into the next higher level. This insertion consists of a + * key or two keys and their corresponding pointers. + */ +static int balance_leaf(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag, + struct item_head *insert_key, + struct buffer_head **insert_ptr) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + + PROC_INFO_INC(tb->tb_sb, balance_at[0]); + + /* Make balance in case insert_size[0] < 0 */ + if (tb->insert_size[0] < 0) + return balance_leaf_when_delete(tb, flag); + + tb->item_pos = PATH_LAST_POSITION(tb->tb_path), + tb->pos_in_item = tb->tb_path->pos_in_item, + tb->zeroes_num = 0; + if (flag == M_INSERT && !body) + tb->zeroes_num = ih_item_len(ih); + + /* + * for indirect item pos_in_item is measured in unformatted node + * pointers. Recalculate to bytes + */ + if (flag != M_INSERT + && is_indirect_le_ih(item_head(tbS0, tb->item_pos))) + tb->pos_in_item *= UNFM_P_SIZE; + + balance_leaf_left(tb, ih, body, flag); + + /* tb->lnum[0] > 0 */ + /* Calculate new item position */ + tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0)); + + balance_leaf_right(tb, ih, body, flag); + + /* tb->rnum[0] > 0 */ + RFALSE(tb->blknum[0] > 3, + "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]); + RFALSE(tb->blknum[0] < 0, + "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]); + + /* + * if while adding to a node we discover that it is possible to split + * it in two, and merge the left part into the left neighbor and the + * right part into the right neighbor, eliminating the node + */ + if (tb->blknum[0] == 0) { /* node S[0] is empty now */ + + RFALSE(!tb->lnum[0] || !tb->rnum[0], + "PAP-12190: lnum and rnum must not be zero"); + /* + * if insertion was done before 0-th position in R[0], right + * delimiting key of the tb->L[0]'s and left delimiting key are + * not set correctly + */ + if (tb->CFL[0]) { + if (!tb->CFR[0]) + reiserfs_panic(tb->tb_sb, "vs-12195", + "CFR not initialized"); + copy_key(internal_key(tb->CFL[0], tb->lkey[0]), + internal_key(tb->CFR[0], tb->rkey[0])); + do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); } + + reiserfs_invalidate_buffer(tb, tbS0); + return 0; } + + balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag); + + balance_leaf_finish_node(tb, ih, body, flag); + #ifdef CONFIG_REISERFS_CHECK if (flag == M_PASTE && tb->insert_size[0]) { print_cur_tb("12290"); @@ -1642,9 +1454,11 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h "PAP-12290", "insert_size is still not 0 (%d)", tb->insert_size[0]); } -#endif /* CONFIG_REISERFS_CHECK */ +#endif + + /* Leaf level of the tree is balanced (end of balance_leaf) */ return 0; -} /* Leaf level of the tree is balanced (end of balance_leaf) */ +} /* Make empty node */ void make_empty_node(struct buffer_info *bi) @@ -1683,9 +1497,7 @@ struct buffer_head *get_FEB(struct tree_balance *tb) return tb->used[i]; } -/* This is now used because reiserfs_free_block has to be able to -** schedule. -*/ +/* This is now used because reiserfs_free_block has to be able to schedule. */ static void store_thrown(struct tree_balance *tb, struct buffer_head *bh) { int i; @@ -1751,10 +1563,10 @@ void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest, if (B_IS_ITEMS_LEVEL(src)) /* source buffer contains leaf node */ - memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PITEM_HEAD(src, n_src), + memcpy(internal_key(dest, n_dest), item_head(src, n_src), KEY_SIZE); else - memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PDELIM_KEY(src, n_src), + memcpy(internal_key(dest, n_dest), internal_key(src, n_src), KEY_SIZE); do_balance_mark_internal_dirty(tb, dest, 0); @@ -1840,8 +1652,10 @@ static int check_before_balancing(struct tree_balance *tb) "mount point."); } - /* double check that buffers that we will modify are unlocked. (fix_nodes should already have - prepped all of these for us). */ + /* + * double check that buffers that we will modify are unlocked. + * (fix_nodes should already have prepped all of these for us). + */ if (tb->lnum[0]) { retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]"); retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]"); @@ -1934,49 +1748,51 @@ static void check_internal_levels(struct tree_balance *tb) #endif -/* Now we have all of the buffers that must be used in balancing of - the tree. We rely on the assumption that schedule() will not occur - while do_balance works. ( Only interrupt handlers are acceptable.) - We balance the tree according to the analysis made before this, - using buffers already obtained. For SMP support it will someday be - necessary to add ordered locking of tb. */ - -/* Some interesting rules of balancing: - - we delete a maximum of two nodes per level per balancing: we never - delete R, when we delete two of three nodes L, S, R then we move - them into R. - - we only delete L if we are deleting two nodes, if we delete only - one node we delete S - - if we shift leaves then we shift as much as we can: this is a - deliberate policy of extremism in node packing which results in - higher average utilization after repeated random balance operations - at the cost of more memory copies and more balancing as a result of - small insertions to full nodes. - - if we shift internal nodes we try to evenly balance the node - utilization, with consequent less balancing at the cost of lower - utilization. - - one could argue that the policy for directories in leaves should be - that of internal nodes, but we will wait until another day to - evaluate this.... It would be nice to someday measure and prove - these assumptions as to what is optimal.... +/* + * Now we have all of the buffers that must be used in balancing of + * the tree. We rely on the assumption that schedule() will not occur + * while do_balance works. ( Only interrupt handlers are acceptable.) + * We balance the tree according to the analysis made before this, + * using buffers already obtained. For SMP support it will someday be + * necessary to add ordered locking of tb. + */ -*/ +/* + * Some interesting rules of balancing: + * we delete a maximum of two nodes per level per balancing: we never + * delete R, when we delete two of three nodes L, S, R then we move + * them into R. + * + * we only delete L if we are deleting two nodes, if we delete only + * one node we delete S + * + * if we shift leaves then we shift as much as we can: this is a + * deliberate policy of extremism in node packing which results in + * higher average utilization after repeated random balance operations + * at the cost of more memory copies and more balancing as a result of + * small insertions to full nodes. + * + * if we shift internal nodes we try to evenly balance the node + * utilization, with consequent less balancing at the cost of lower + * utilization. + * + * one could argue that the policy for directories in leaves should be + * that of internal nodes, but we will wait until another day to + * evaluate this.... It would be nice to someday measure and prove + * these assumptions as to what is optimal.... + */ static inline void do_balance_starts(struct tree_balance *tb) { - /* use print_cur_tb() to see initial state of struct - tree_balance */ + /* use print_cur_tb() to see initial state of struct tree_balance */ /* store_print_tb (tb); */ /* do not delete, just comment it out */ -/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, - "check");*/ + /* + print_tb(flag, PATH_LAST_POSITION(tb->tb_path), + tb->tb_path->pos_in_item, tb, "check"); + */ RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); #ifdef CONFIG_REISERFS_CHECK REISERFS_SB(tb->tb_sb)->cur_tb = tb; @@ -1992,9 +1808,10 @@ static inline void do_balance_completed(struct tree_balance *tb) REISERFS_SB(tb->tb_sb)->cur_tb = NULL; #endif - /* reiserfs_free_block is no longer schedule safe. So, we need to - ** put the buffers we want freed on the thrown list during do_balance, - ** and then free them now + /* + * reiserfs_free_block is no longer schedule safe. So, we need to + * put the buffers we want freed on the thrown list during do_balance, + * and then free them now */ REISERFS_SB(tb->tb_sb)->s_do_balance++; @@ -2005,36 +1822,40 @@ static inline void do_balance_completed(struct tree_balance *tb) free_thrown(tb); } -void do_balance(struct tree_balance *tb, /* tree_balance structure */ - struct item_head *ih, /* item header of inserted item */ - const char *body, /* body of inserted item or bytes to paste */ - int flag) -{ /* i - insert, d - delete - c - cut, p - paste - - Cut means delete part of an item - (includes removing an entry from a - directory). - - Delete means delete whole item. - - Insert means add a new item into the - tree. - - Paste means to append to the end of an - existing file or to insert a directory - entry. */ - int child_pos, /* position of a child node in its parent */ - h; /* level of the tree being processed */ - struct item_head insert_key[2]; /* in our processing of one level - we sometimes determine what - must be inserted into the next - higher level. This insertion - consists of a key or two keys - and their corresponding - pointers */ - struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next - level */ +/* + * do_balance - balance the tree + * + * @tb: tree_balance structure + * @ih: item header of inserted item + * @body: body of inserted item or bytes to paste + * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' paste + * + * Cut means delete part of an item (includes removing an entry from a + * directory). + * + * Delete means delete whole item. + * + * Insert means add a new item into the tree. + * + * Paste means to append to the end of an existing file or to + * insert a directory entry. + */ +void do_balance(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag) +{ + int child_pos; /* position of a child node in its parent */ + int h; /* level of the tree being processed */ + + /* + * in our processing of one level we sometimes determine what + * must be inserted into the next higher level. This insertion + * consists of a key or two keys and their corresponding + * pointers + */ + struct item_head insert_key[2]; + + /* inserted node-ptrs for the next level */ + struct buffer_head *insert_ptr[2]; tb->tb_mode = flag; tb->need_balance_dirty = 0; @@ -2051,12 +1872,14 @@ void do_balance(struct tree_balance *tb, /* tree_balance structure */ return; } - atomic_inc(&(fs_generation(tb->tb_sb))); + atomic_inc(&fs_generation(tb->tb_sb)); do_balance_starts(tb); - /* balance leaf returns 0 except if combining L R and S into - one node. see balance_internal() for explanation of this - line of code. */ + /* + * balance_leaf returns 0 except if combining L R and S into + * one node. see balance_internal() for explanation of this + * line of code. + */ child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) + balance_leaf(tb, ih, body, flag, insert_key, insert_ptr); @@ -2066,9 +1889,8 @@ void do_balance(struct tree_balance *tb, /* tree_balance structure */ /* Balance internal level of the tree. */ for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++) - child_pos = - balance_internal(tb, h, child_pos, insert_key, insert_ptr); + child_pos = balance_internal(tb, h, child_pos, insert_key, + insert_ptr); do_balance_completed(tb); - } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index dcaafcfc23b..db9e80ba53a 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -15,20 +15,20 @@ #include <linux/quotaops.h> /* -** We pack the tails of files on file close, not at the time they are written. -** This implies an unnecessary copy of the tail and an unnecessary indirect item -** insertion/balancing, for files that are written in one write. -** It avoids unnecessary tail packings (balances) for files that are written in -** multiple writes and are small enough to have tails. -** -** file_release is called by the VFS layer when the file is closed. If -** this is the last open file descriptor, and the file -** small enough to have a tail, and the tail is currently in an -** unformatted node, the tail is converted back into a direct item. -** -** We use reiserfs_truncate_file to pack the tail, since it already has -** all the conditions coded. -*/ + * We pack the tails of files on file close, not at the time they are written. + * This implies an unnecessary copy of the tail and an unnecessary indirect item + * insertion/balancing, for files that are written in one write. + * It avoids unnecessary tail packings (balances) for files that are written in + * multiple writes and are small enough to have tails. + * + * file_release is called by the VFS layer when the file is closed. If + * this is the last open file descriptor, and the file + * small enough to have a tail, and the tail is currently in an + * unformatted node, the tail is converted back into a direct item. + * + * We use reiserfs_truncate_file to pack the tail, since it already has + * all the conditions coded. + */ static int reiserfs_file_release(struct inode *inode, struct file *filp) { @@ -41,10 +41,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1)) return 0; - mutex_lock(&(REISERFS_I(inode)->tailpack)); + mutex_lock(&REISERFS_I(inode)->tailpack); if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) { - mutex_unlock(&(REISERFS_I(inode)->tailpack)); + mutex_unlock(&REISERFS_I(inode)->tailpack); return 0; } @@ -52,31 +52,35 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || !tail_has_to_be_packed(inode)) && REISERFS_I(inode)->i_prealloc_count <= 0) { - mutex_unlock(&(REISERFS_I(inode)->tailpack)); + mutex_unlock(&REISERFS_I(inode)->tailpack); return 0; } reiserfs_write_lock(inode->i_sb); - /* freeing preallocation only involves relogging blocks that + /* + * freeing preallocation only involves relogging blocks that * are already in the current transaction. preallocation gets * freed at the end of each transaction, so it is impossible for * us to log any additional blocks (including quota blocks) */ err = journal_begin(&th, inode->i_sb, 1); if (err) { - /* uh oh, we can't allow the inode to go away while there + /* + * uh oh, we can't allow the inode to go away while there * is still preallocation blocks pending. Try to join the * aborted transaction */ jbegin_failure = err; - err = journal_join_abort(&th, inode->i_sb, 1); + err = journal_join_abort(&th, inode->i_sb); if (err) { - /* hmpf, our choices here aren't good. We can pin the inode - * which will disallow unmount from every happening, we can - * do nothing, which will corrupt random memory on unmount, - * or we can forcibly remove the file from the preallocation - * list, which will leak blocks on disk. Lets pin the inode + /* + * hmpf, our choices here aren't good. We can pin + * the inode which will disallow unmount from ever + * happening, we can do nothing, which will corrupt + * random memory on unmount, or we can forcibly + * remove the file from the preallocation list, which + * will leak blocks on disk. Lets pin the inode * and let the admin know what is going on. */ igrab(inode); @@ -92,7 +96,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) #ifdef REISERFS_PREALLOCATE reiserfs_discard_prealloc(&th, inode); #endif - err = journal_end(&th, inode->i_sb, 1); + err = journal_end(&th); /* copy back the error code from journal_begin */ if (!err) @@ -102,35 +106,38 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && tail_has_to_be_packed(inode)) { - /* if regular file is released by last holder and it has been - appended (we append by unformatted node only) or its direct - item(s) had to be converted, then it may have to be - indirect2direct converted */ + /* + * if regular file is released by last holder and it has been + * appended (we append by unformatted node only) or its direct + * item(s) had to be converted, then it may have to be + * indirect2direct converted + */ err = reiserfs_truncate_file(inode, 0); } - out: +out: reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&(REISERFS_I(inode)->tailpack)); + mutex_unlock(&REISERFS_I(inode)->tailpack); return err; } static int reiserfs_file_open(struct inode *inode, struct file *file) { int err = dquot_file_open(inode, file); + + /* somebody might be tailpacking on final close; wait for it */ if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) { - /* somebody might be tailpacking on final close; wait for it */ - mutex_lock(&(REISERFS_I(inode)->tailpack)); + mutex_lock(&REISERFS_I(inode)->tailpack); atomic_inc(&REISERFS_I(inode)->openers); - mutex_unlock(&(REISERFS_I(inode)->tailpack)); + mutex_unlock(&REISERFS_I(inode)->tailpack); } return err; } void reiserfs_vfs_truncate_file(struct inode *inode) { - mutex_lock(&(REISERFS_I(inode)->tailpack)); + mutex_lock(&REISERFS_I(inode)->tailpack); reiserfs_truncate_file(inode, 1); - mutex_unlock(&(REISERFS_I(inode)->tailpack)); + mutex_unlock(&REISERFS_I(inode)->tailpack); } /* Sync a reiserfs file. */ @@ -205,10 +212,11 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, set_buffer_uptodate(bh); if (logit) { reiserfs_prepare_for_journal(s, bh, 1); - journal_mark_dirty(&th, s, bh); + journal_mark_dirty(&th, bh); } else if (!buffer_dirty(bh)) { mark_buffer_dirty(bh); - /* do data=ordered on any page past the end + /* + * do data=ordered on any page past the end * of file and any buffer marked BH_New. */ if (reiserfs_data_ordered(inode->i_sb) && @@ -219,8 +227,8 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, } } if (logit) { - ret = journal_end(&th, s, bh_per_page + 1); - drop_write_lock: + ret = journal_end(&th); +drop_write_lock: reiserfs_write_unlock(s); } /* @@ -235,8 +243,8 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, } const struct file_operations reiserfs_file_operations = { - .read = do_sync_read, - .write = do_sync_write, + .read = new_sync_read, + .write = new_sync_write, .unlocked_ioctl = reiserfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = reiserfs_compat_ioctl, @@ -245,10 +253,10 @@ const struct file_operations reiserfs_file_operations = { .open = reiserfs_file_open, .release = reiserfs_file_release, .fsync = reiserfs_sync_file, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; @@ -260,4 +268,5 @@ const struct inode_operations reiserfs_file_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index dc4d4153031..6b0ddb2a909 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -2,59 +2,32 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ -/** - ** old_item_num - ** old_entry_num - ** set_entry_sizes - ** create_virtual_node - ** check_left - ** check_right - ** directory_part_size - ** get_num_ver - ** set_parameters - ** is_leaf_removable - ** are_leaves_removable - ** get_empty_nodes - ** get_lfree - ** get_rfree - ** is_left_neighbor_in_cache - ** decrement_key - ** get_far_parent - ** get_parents - ** can_node_be_removed - ** ip_check_balance - ** dc_check_balance_internal - ** dc_check_balance_leaf - ** dc_check_balance - ** check_balance - ** get_direct_parent - ** get_neighbors - ** fix_nodes - ** - ** - **/ - #include <linux/time.h> #include <linux/slab.h> #include <linux/string.h> #include "reiserfs.h" #include <linux/buffer_head.h> -/* To make any changes in the tree we find a node, that contains item - to be changed/deleted or position in the node we insert a new item - to. We call this node S. To do balancing we need to decide what we - will shift to left/right neighbor, or to a new node, where new item - will be etc. To make this analysis simpler we build virtual - node. Virtual node is an array of items, that will replace items of - node S. (For instance if we are going to delete an item, virtual - node does not contain it). Virtual node keeps information about - item sizes and types, mergeability of first and last items, sizes - of all entries in directory item. We use this array of items when - calculating what we can shift to neighbors and how many nodes we - have to have if we do not any shiftings, if we shift to left/right - neighbor or to both. */ - -/* taking item number in virtual node, returns number of item, that it has in source buffer */ +/* + * To make any changes in the tree we find a node that contains item + * to be changed/deleted or position in the node we insert a new item + * to. We call this node S. To do balancing we need to decide what we + * will shift to left/right neighbor, or to a new node, where new item + * will be etc. To make this analysis simpler we build virtual + * node. Virtual node is an array of items, that will replace items of + * node S. (For instance if we are going to delete an item, virtual + * node does not contain it). Virtual node keeps information about + * item sizes and types, mergeability of first and last items, sizes + * of all entries in directory item. We use this array of items when + * calculating what we can shift to neighbors and how many nodes we + * have to have if we do not any shiftings, if we shift to left/right + * neighbor or to both. + */ + +/* + * Takes item number in virtual node, returns number of item + * that it has in source buffer + */ static inline int old_item_num(int new_num, int affected_item_num, int mode) { if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) @@ -105,14 +78,17 @@ static void create_virtual_node(struct tree_balance *tb, int h) vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item); /* first item in the node */ - ih = B_N_PITEM_HEAD(Sh, 0); + ih = item_head(Sh, 0); /* define the mergeability for 0-th item (if it is not being deleted) */ - if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size) + if (op_is_left_mergeable(&ih->ih_key, Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; - /* go through all items those remain in the virtual node (except for the new (inserted) one) */ + /* + * go through all items that remain in the virtual + * node (except for the new (inserted) one) + */ for (new_num = 0; new_num < vn->vn_nr_item; new_num++) { int j; struct virtual_item *vi = vn->vn_vi + new_num; @@ -128,11 +104,13 @@ static void create_virtual_node(struct tree_balance *tb, int h) vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; vi->vi_ih = ih + j; - vi->vi_item = B_I_PITEM(Sh, ih + j); + vi->vi_item = ih_item_body(Sh, ih + j); vi->vi_uarea = vn->vn_free_ptr; - // FIXME: there is no check, that item operation did not - // consume too much memory + /* + * FIXME: there is no check that item operation did not + * consume too much memory + */ vn->vn_free_ptr += op_create_vi(vn, vi, is_affected, tb->insert_size[0]); if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) @@ -145,7 +123,8 @@ static void create_virtual_node(struct tree_balance *tb, int h) if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; - vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted + /* pointer to data which is going to be pasted */ + vi->vi_new_data = vn->vn_data; } } @@ -164,11 +143,14 @@ static void create_virtual_node(struct tree_balance *tb, int h) tb->insert_size[0]); } - /* set right merge flag we take right delimiting key and check whether it is a mergeable item */ + /* + * set right merge flag we take right delimiting key and + * check whether it is a mergeable item + */ if (tb->CFR[0]) { struct reiserfs_key *key; - key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]); + key = internal_key(tb->CFR[0], tb->rkey[0]); if (op_is_left_mergeable(key, Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) @@ -179,12 +161,19 @@ static void create_virtual_node(struct tree_balance *tb, int h) if (op_is_left_mergeable(key, Sh->b_size) && !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) { - /* we delete last item and it could be merged with right neighbor's first item */ + /* + * we delete last item and it could be merged + * with right neighbor's first item + */ if (! (B_NR_ITEMS(Sh) == 1 - && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0)) - && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) { - /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */ + && is_direntry_le_ih(item_head(Sh, 0)) + && ih_entry_count(item_head(Sh, 0)) == 1)) { + /* + * node contains more than 1 item, or item + * is not directory item, or this item + * contains more than 1 entry + */ print_block(Sh, 0, -1, -1); reiserfs_panic(tb->tb_sb, "vs-8045", "rdkey %k, affected item==%d " @@ -198,8 +187,10 @@ static void create_virtual_node(struct tree_balance *tb, int h) } } -/* using virtual node check, how many items can be shifted to left - neighbor */ +/* + * Using virtual node check, how many items can be + * shifted to left neighbor + */ static void check_left(struct tree_balance *tb, int h, int cur_free) { int i; @@ -259,9 +250,13 @@ static void check_left(struct tree_balance *tb, int h, int cur_free) } /* the item cannot be shifted entirely, try to split it */ - /* check whether L[0] can hold ih and at least one byte of the item body */ + /* + * check whether L[0] can hold ih and at least one byte + * of the item body + */ + + /* cannot shift even a part of the current item */ if (cur_free <= ih_size) { - /* cannot shift even a part of the current item */ tb->lbytes = -1; return; } @@ -278,8 +273,10 @@ static void check_left(struct tree_balance *tb, int h, int cur_free) return; } -/* using virtual node check, how many items can be shifted to right - neighbor */ +/* + * Using virtual node check, how many items can be + * shifted to right neighbor + */ static void check_right(struct tree_balance *tb, int h, int cur_free) { int i; @@ -338,13 +335,21 @@ static void check_right(struct tree_balance *tb, int h, int cur_free) continue; } - /* check whether R[0] can hold ih and at least one byte of the item body */ - if (cur_free <= ih_size) { /* cannot shift even a part of the current item */ + /* + * check whether R[0] can hold ih and at least one + * byte of the item body + */ + + /* cannot shift even a part of the current item */ + if (cur_free <= ih_size) { tb->rbytes = -1; return; } - /* R[0] can hold the header of the item and at least one byte of its body */ + /* + * R[0] can hold the header of the item and at least + * one byte of its body + */ cur_free -= ih_size; /* cur_free is still > 0 */ tb->rbytes = op_check_right(vi, cur_free); @@ -361,45 +366,64 @@ static void check_right(struct tree_balance *tb, int h, int cur_free) /* * from - number of items, which are shifted to left neighbor entirely * to - number of item, which are shifted to right neighbor entirely - * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor - * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */ + * from_bytes - number of bytes of boundary item (or directory entries) + * which are shifted to left neighbor + * to_bytes - number of bytes of boundary item (or directory entries) + * which are shifted to right neighbor + */ static int get_num_ver(int mode, struct tree_balance *tb, int h, int from, int from_bytes, int to, int to_bytes, short *snum012, int flow) { int i; int cur_free; - // int bytes; int units; struct virtual_node *vn = tb->tb_vn; - // struct virtual_item * vi; - int total_node_size, max_node_size, current_item_size; int needed_nodes; - int start_item, /* position of item we start filling node from */ - end_item, /* position of item we finish filling node by */ - start_bytes, /* number of first bytes (entries for directory) of start_item-th item - we do not include into node that is being filled */ - end_bytes; /* number of last bytes (entries for directory) of end_item-th item - we do node include into node that is being filled */ - int split_item_positions[2]; /* these are positions in virtual item of - items, that are split between S[0] and - S1new and S1new and S2new */ + + /* position of item we start filling node from */ + int start_item; + + /* position of item we finish filling node by */ + int end_item; + + /* + * number of first bytes (entries for directory) of start_item-th item + * we do not include into node that is being filled + */ + int start_bytes; + + /* + * number of last bytes (entries for directory) of end_item-th item + * we do node include into node that is being filled + */ + int end_bytes; + + /* + * these are positions in virtual item of items, that are split + * between S[0] and S1new and S1new and S2new + */ + int split_item_positions[2]; split_item_positions[0] = -1; split_item_positions[1] = -1; - /* We only create additional nodes if we are in insert or paste mode - or we are in replace mode at the internal level. If h is 0 and - the mode is M_REPLACE then in fix_nodes we change the mode to - paste or insert before we get here in the code. */ + /* + * We only create additional nodes if we are in insert or paste mode + * or we are in replace mode at the internal level. If h is 0 and + * the mode is M_REPLACE then in fix_nodes we change the mode to + * paste or insert before we get here in the code. + */ RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), "vs-8100: insert_size < 0 in overflow"); max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h)); - /* snum012 [0-2] - number of items, that lay - to S[0], first new node and second new node */ + /* + * snum012 [0-2] - number of items, that lay + * to S[0], first new node and second new node + */ snum012[3] = -1; /* s1bytes */ snum012[4] = -1; /* s2bytes */ @@ -416,20 +440,22 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, total_node_size = 0; cur_free = max_node_size; - // start from 'from'-th item + /* start from 'from'-th item */ start_item = from; - // skip its first 'start_bytes' units + /* skip its first 'start_bytes' units */ start_bytes = ((from_bytes != -1) ? from_bytes : 0); - // last included item is the 'end_item'-th one + /* last included item is the 'end_item'-th one */ end_item = vn->vn_nr_item - to - 1; - // do not count last 'end_bytes' units of 'end_item'-th item + /* do not count last 'end_bytes' units of 'end_item'-th item */ end_bytes = (to_bytes != -1) ? to_bytes : 0; - /* go through all item beginning from the start_item-th item and ending by - the end_item-th item. Do not count first 'start_bytes' units of - 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */ - + /* + * go through all item beginning from the start_item-th item + * and ending by the end_item-th item. Do not count first + * 'start_bytes' units of 'start_item'-th item and last + * 'end_bytes' of 'end_item'-th item + */ for (i = start_item; i <= end_item; i++) { struct virtual_item *vi = vn->vn_vi + i; int skip_from_end = ((i == end_item) ? end_bytes : 0); @@ -439,7 +465,10 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, /* get size of current item */ current_item_size = vi->vi_item_len; - /* do not take in calculation head part (from_bytes) of from-th item */ + /* + * do not take in calculation head part (from_bytes) + * of from-th item + */ current_item_size -= op_part_size(vi, 0 /*from start */ , start_bytes); @@ -455,9 +484,11 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, continue; } + /* + * virtual item length is longer, than max size of item in + * a node. It is impossible for direct item + */ if (current_item_size > max_node_size) { - /* virtual item length is longer, than max size of item in - a node. It is impossible for direct item */ RFALSE(is_direct_le_ih(vi->vi_ih), "vs-8110: " "direct item length is %d. It can not be longer than %d", @@ -466,15 +497,18 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, flow = 1; } + /* as we do not split items, take new node and continue */ if (!flow) { - /* as we do not split items, take new node and continue */ needed_nodes++; i--; total_node_size = 0; continue; } - // calculate number of item units which fit into node being - // filled + + /* + * calculate number of item units which fit into node being + * filled + */ { int free_space; @@ -482,17 +516,17 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, units = op_check_left(vi, free_space, start_bytes, skip_from_end); + /* + * nothing fits into current node, take new + * node and continue + */ if (units == -1) { - /* nothing fits into current node, take new node and continue */ needed_nodes++, i--, total_node_size = 0; continue; } } /* something fits into the current node */ - //if (snum012[3] != -1 || needed_nodes != 1) - // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required"); - //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units; start_bytes += units; snum012[needed_nodes - 1 + 3] = units; @@ -508,9 +542,11 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, total_node_size = 0; } - // sum012[4] (if it is not -1) contains number of units of which - // are to be in S1new, snum012[3] - to be in S0. They are supposed - // to be S1bytes and S2bytes correspondingly, so recalculate + /* + * sum012[4] (if it is not -1) contains number of units of which + * are to be in S1new, snum012[3] - to be in S0. They are supposed + * to be S1bytes and S2bytes correspondingly, so recalculate + */ if (snum012[4] > 0) { int split_item_num; int bytes_to_r, bytes_to_l; @@ -527,7 +563,7 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0); - // s2bytes + /* s2bytes */ snum012[4] = op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new; @@ -555,7 +591,7 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0); - // s1bytes + /* s1bytes */ snum012[3] = op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new; @@ -565,7 +601,8 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, } -/* Set parameters for balancing. +/* + * Set parameters for balancing. * Performs write of results of analysis of balancing into structure tb, * where it will later be used by the functions that actually do the balancing. * Parameters: @@ -575,11 +612,12 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, * rnum number of items from S[h] that must be shifted to R[h]; * blk_num number of blocks that S[h] will be splitted into; * s012 number of items that fall into splitted nodes. - * lbytes number of bytes which flow to the left neighbor from the item that is not - * not shifted entirely - * rbytes number of bytes which flow to the right neighbor from the item that is not - * not shifted entirely - * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array) + * lbytes number of bytes which flow to the left neighbor from the + * item that is not not shifted entirely + * rbytes number of bytes which flow to the right neighbor from the + * item that is not not shifted entirely + * s1bytes number of bytes which flow to the first new node when + * S[0] splits (this number is contained in s012 array) */ static void set_parameters(struct tree_balance *tb, int h, int lnum, @@ -590,12 +628,14 @@ static void set_parameters(struct tree_balance *tb, int h, int lnum, tb->rnum[h] = rnum; tb->blknum[h] = blk_num; - if (h == 0) { /* only for leaf level */ + /* only for leaf level */ + if (h == 0) { if (s012 != NULL) { - tb->s0num = *s012++, - tb->s1num = *s012++, tb->s2num = *s012++; - tb->s1bytes = *s012++; - tb->s2bytes = *s012; + tb->s0num = *s012++; + tb->snum[0] = *s012++; + tb->snum[1] = *s012++; + tb->sbytes[0] = *s012++; + tb->sbytes[1] = *s012; } tb->lbytes = lb; tb->rbytes = rb; @@ -607,8 +647,10 @@ static void set_parameters(struct tree_balance *tb, int h, int lnum, PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb); } -/* check, does node disappear if we shift tb->lnum[0] items to left - neighbor and tb->rnum[0] to the right one. */ +/* + * check if node disappears if we shift tb->lnum[0] items to left + * neighbor and tb->rnum[0] to the right one. + */ static int is_leaf_removable(struct tree_balance *tb) { struct virtual_node *vn = tb->tb_vn; @@ -616,8 +658,10 @@ static int is_leaf_removable(struct tree_balance *tb) int size; int remain_items; - /* number of items, that will be shifted to left (right) neighbor - entirely */ + /* + * number of items that will be shifted to left (right) neighbor + * entirely + */ to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0); remain_items = vn->vn_nr_item; @@ -625,21 +669,21 @@ static int is_leaf_removable(struct tree_balance *tb) /* how many items remain in S[0] after shiftings to neighbors */ remain_items -= (to_left + to_right); + /* all content of node can be shifted to neighbors */ if (remain_items < 1) { - /* all content of node can be shifted to neighbors */ set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1); return 1; } + /* S[0] is not removable */ if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) - /* S[0] is not removable */ return 0; - /* check, whether we can divide 1 remaining item between neighbors */ + /* check whether we can divide 1 remaining item between neighbors */ /* get size of remaining item (in item units) */ - size = op_unit_num(&(vn->vn_vi[to_left])); + size = op_unit_num(&vn->vn_vi[to_left]); if (tb->lbytes + tb->rbytes >= size) { set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL, @@ -675,23 +719,28 @@ static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree) "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0)); - ih = B_N_PITEM_HEAD(S0, 0); + ih = item_head(S0, 0); if (tb->CFR[0] - && !comp_short_le_keys(&(ih->ih_key), - B_N_PDELIM_KEY(tb->CFR[0], + && !comp_short_le_keys(&ih->ih_key, + internal_key(tb->CFR[0], tb->rkey[0]))) + /* + * Directory must be in correct state here: that is + * somewhere at the left side should exist first + * directory item. But the item being deleted can + * not be that first one because its right neighbor + * is item of the same directory. (But first item + * always gets deleted in last turn). So, neighbors + * of deleted item can be merged, so we can save + * ih_size + */ if (is_direntry_le_ih(ih)) { - /* Directory must be in correct state here: that is - somewhere at the left side should exist first directory - item. But the item being deleted can not be that first - one because its right neighbor is item of the same - directory. (But first item always gets deleted in last - turn). So, neighbors of deleted item can be merged, so - we can save ih_size */ ih_size = IH_SIZE; - /* we might check that left neighbor exists and is of the - same directory */ + /* + * we might check that left neighbor exists + * and is of the same directory + */ RFALSE(le_ih_k_offset(ih) == DOT_OFFSET, "vs-8130: first directory item can not be removed until directory is not empty"); } @@ -770,7 +819,8 @@ static void free_buffers_in_tb(struct tree_balance *tb) } } -/* Get new buffers for storing new nodes that are created while balancing. +/* + * Get new buffers for storing new nodes that are created while balancing. * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; * CARRY_ON - schedule didn't occur while the function worked; * NO_DISK_SPACE - no disk space. @@ -778,28 +828,33 @@ static void free_buffers_in_tb(struct tree_balance *tb) /* The function is NOT SCHEDULE-SAFE! */ static int get_empty_nodes(struct tree_balance *tb, int h) { - struct buffer_head *new_bh, - *Sh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h); b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; - int counter, number_of_freeblk, amount_needed, /* number of needed empty blocks */ - retval = CARRY_ON; + int counter, number_of_freeblk; + int amount_needed; /* number of needed empty blocks */ + int retval = CARRY_ON; struct super_block *sb = tb->tb_sb; - /* number_of_freeblk is the number of empty blocks which have been - acquired for use by the balancing algorithm minus the number of - empty blocks used in the previous levels of the analysis, - number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs - after empty blocks are acquired, and the balancing analysis is - then restarted, amount_needed is the number needed by this level - (h) of the balancing analysis. - - Note that for systems with many processes writing, it would be - more layout optimal to calculate the total number needed by all - levels and then to run reiserfs_new_blocks to get all of them at once. */ - - /* Initiate number_of_freeblk to the amount acquired prior to the restart of - the analysis or 0 if not restarted, then subtract the amount needed - by all of the levels of the tree below h. */ + /* + * number_of_freeblk is the number of empty blocks which have been + * acquired for use by the balancing algorithm minus the number of + * empty blocks used in the previous levels of the analysis, + * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule + * occurs after empty blocks are acquired, and the balancing analysis + * is then restarted, amount_needed is the number needed by this + * level (h) of the balancing analysis. + * + * Note that for systems with many processes writing, it would be + * more layout optimal to calculate the total number needed by all + * levels and then to run reiserfs_new_blocks to get all of them at + * once. + */ + + /* + * Initiate number_of_freeblk to the amount acquired prior to the + * restart of the analysis or 0 if not restarted, then subtract the + * amount needed by all of the levels of the tree below h. + */ /* blknum includes S[h], so we subtract 1 in this calculation */ for (counter = 0, number_of_freeblk = tb->cur_blknum; counter < h; counter++) @@ -810,13 +865,19 @@ static int get_empty_nodes(struct tree_balance *tb, int h) /* Allocate missing empty blocks. */ /* if Sh == 0 then we are getting a new root */ amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1; - /* Amount_needed = the amount that we need more than the amount that we have. */ + /* + * Amount_needed = the amount that we need more than the + * amount that we have. + */ if (amount_needed > number_of_freeblk) amount_needed -= number_of_freeblk; - else /* If we have enough already then there is nothing to do. */ + else /* If we have enough already then there is nothing to do. */ return CARRY_ON; - /* No need to check quota - is not allocated for blocks used for formatted nodes */ + /* + * No need to check quota - is not allocated for blocks used + * for formatted nodes + */ if (reiserfs_new_form_blocknrs(tb, blocknrs, amount_needed) == NO_DISK_SPACE) return NO_DISK_SPACE; @@ -849,8 +910,10 @@ static int get_empty_nodes(struct tree_balance *tb, int h) return retval; } -/* Get free space of the left neighbor, which is stored in the parent - * node of the left neighbor. */ +/* + * Get free space of the left neighbor, which is stored in the parent + * node of the left neighbor. + */ static int get_lfree(struct tree_balance *tb, int h) { struct buffer_head *l, *f; @@ -870,7 +933,8 @@ static int get_lfree(struct tree_balance *tb, int h) return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); } -/* Get free space of the right neighbor, +/* + * Get free space of the right neighbor, * which is stored in the parent node of the right neighbor. */ static int get_rfree(struct tree_balance *tb, int h) @@ -916,7 +980,10 @@ static int is_left_neighbor_in_cache(struct tree_balance *tb, int h) "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", father, tb->FL[h]); - /* Get position of the pointer to the left neighbor into the left father. */ + /* + * Get position of the pointer to the left neighbor + * into the left father. + */ left_neighbor_position = (father == tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->FL[h]); /* Get left neighbor block number. */ @@ -940,17 +1007,20 @@ static int is_left_neighbor_in_cache(struct tree_balance *tb, int h) static void decrement_key(struct cpu_key *key) { - // call item specific function for this key + /* call item specific function for this key */ item_ops[cpu_key_k_type(key)]->decrement_key(key); } -/* Calculate far left/right parent of the left/right neighbor of the current node, that - * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h]. +/* + * Calculate far left/right parent of the left/right neighbor of the + * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor + * of the parent F[h]. * Calculate left/right common parent of the current node and L[h]/R[h]. * Calculate left/right delimiting key position. - * Returns: PATH_INCORRECT - path in the tree is not correct; - SCHEDULE_OCCURRED - schedule occurred while the function worked; - * CARRY_ON - schedule didn't occur while the function worked; + * Returns: PATH_INCORRECT - path in the tree is not correct + * SCHEDULE_OCCURRED - schedule occurred while the function worked + * CARRY_ON - schedule didn't occur while the function + * worked */ static int get_far_parent(struct tree_balance *tb, int h, @@ -966,8 +1036,10 @@ static int get_far_parent(struct tree_balance *tb, first_last_position = 0, path_offset = PATH_H_PATH_OFFSET(path, h); - /* Starting from F[h] go upwards in the tree, and look for the common - ancestor of F[h], and its neighbor l/r, that should be obtained. */ + /* + * Starting from F[h] go upwards in the tree, and look for the common + * ancestor of F[h], and its neighbor l/r, that should be obtained. + */ counter = path_offset; @@ -975,21 +1047,33 @@ static int get_far_parent(struct tree_balance *tb, "PAP-8180: invalid path length"); for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) { - /* Check whether parent of the current buffer in the path is really parent in the tree. */ + /* + * Check whether parent of the current buffer in the path + * is really parent in the tree. + */ if (!B_IS_IN_TREE (parent = PATH_OFFSET_PBUFFER(path, counter - 1))) return REPEAT_SEARCH; + /* Check whether position in the parent is correct. */ if ((position = PATH_OFFSET_POSITION(path, counter - 1)) > B_NR_ITEMS(parent)) return REPEAT_SEARCH; - /* Check whether parent at the path really points to the child. */ + + /* + * Check whether parent at the path really points + * to the child. + */ if (B_N_CHILD_NUM(parent, position) != PATH_OFFSET_PBUFFER(path, counter)->b_blocknr) return REPEAT_SEARCH; - /* Return delimiting key if position in the parent is not equal to first/last one. */ + + /* + * Return delimiting key if position in the parent is not + * equal to first/last one. + */ if (c_lr_par == RIGHT_PARENTS) first_last_position = B_NR_ITEMS(parent); if (position != first_last_position) { @@ -1002,7 +1086,10 @@ static int get_far_parent(struct tree_balance *tb, /* if we are in the root of the tree, then there is no common father */ if (counter == FIRST_PATH_ELEMENT_OFFSET) { - /* Check whether first buffer in the path is the root of the tree. */ + /* + * Check whether first buffer in the path is the + * root of the tree. + */ if (PATH_OFFSET_PBUFFER (tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == @@ -1031,12 +1118,15 @@ static int get_far_parent(struct tree_balance *tb, } } - /* So, we got common parent of the current node and its left/right neighbor. - Now we are geting the parent of the left/right neighbor. */ + /* + * So, we got common parent of the current node and its + * left/right neighbor. Now we are getting the parent of the + * left/right neighbor. + */ /* Form key to get parent of the left/right neighbor. */ le_key2cpu_key(&s_lr_father_key, - B_N_PDELIM_KEY(*pcom_father, + internal_key(*pcom_father, (c_lr_par == LEFT_PARENTS) ? (tb->lkey[h - 1] = position - @@ -1050,7 +1140,7 @@ static int get_far_parent(struct tree_balance *tb, if (search_by_key (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, h + 1) == IO_ERROR) - // path is released + /* path is released */ return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { @@ -1071,12 +1161,15 @@ static int get_far_parent(struct tree_balance *tb, return CARRY_ON; } -/* Get parents of neighbors of node in the path(S[path_offset]) and common parents of - * S[path_offset] and L[path_offset]/R[path_offset]: F[path_offset], FL[path_offset], - * FR[path_offset], CFL[path_offset], CFR[path_offset]. - * Calculate numbers of left and right delimiting keys position: lkey[path_offset], rkey[path_offset]. - * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; - * CARRY_ON - schedule didn't occur while the function worked; +/* + * Get parents of neighbors of node in the path(S[path_offset]) and + * common parents of S[path_offset] and L[path_offset]/R[path_offset]: + * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset], + * CFR[path_offset]. + * Calculate numbers of left and right delimiting keys position: + * lkey[path_offset], rkey[path_offset]. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked + * CARRY_ON - schedule didn't occur while the function worked */ static int get_parents(struct tree_balance *tb, int h) { @@ -1088,8 +1181,11 @@ static int get_parents(struct tree_balance *tb, int h) /* Current node is the root of the tree or will be root of the tree */ if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { - /* The root can not have parents. - Release nodes which previously were obtained as parents of the current node neighbors. */ + /* + * The root can not have parents. + * Release nodes which previously were obtained as + * parents of the current node neighbors. + */ brelse(tb->FL[h]); brelse(tb->CFL[h]); brelse(tb->FR[h]); @@ -1111,10 +1207,14 @@ static int get_parents(struct tree_balance *tb, int h) get_bh(curf); tb->lkey[h] = position - 1; } else { - /* Calculate current parent of L[path_offset], which is the left neighbor of the current node. - Calculate current common parent of L[path_offset] and the current node. Note that - CFL[path_offset] not equal FL[path_offset] and CFL[path_offset] not equal F[path_offset]. - Calculate lkey[path_offset]. */ + /* + * Calculate current parent of L[path_offset], which is the + * left neighbor of the current node. Calculate current + * common parent of L[path_offset] and the current node. + * Note that CFL[path_offset] not equal FL[path_offset] and + * CFL[path_offset] not equal F[path_offset]. + * Calculate lkey[path_offset]. + */ if ((ret = get_far_parent(tb, h + 1, &curf, &curcf, LEFT_PARENTS)) != CARRY_ON) @@ -1130,19 +1230,22 @@ static int get_parents(struct tree_balance *tb, int h) (curcf && !B_IS_IN_TREE(curcf)), "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf); -/* Get parent FR[h] of R[h]. */ + /* Get parent FR[h] of R[h]. */ -/* Current node is the last child of F[h]. FR[h] != F[h]. */ + /* Current node is the last child of F[h]. FR[h] != F[h]. */ if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) { -/* Calculate current parent of R[h], which is the right neighbor of F[h]. - Calculate current common parent of R[h] and current node. Note that CFR[h] - not equal FR[path_offset] and CFR[h] not equal F[h]. */ + /* + * Calculate current parent of R[h], which is the right + * neighbor of F[h]. Calculate current common parent of + * R[h] and current node. Note that CFR[h] not equal + * FR[path_offset] and CFR[h] not equal F[h]. + */ if ((ret = get_far_parent(tb, h + 1, &curf, &curcf, RIGHT_PARENTS)) != CARRY_ON) return ret; } else { -/* Current node is not the last child of its parent F[h]. */ + /* Current node is not the last child of its parent F[h]. */ curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); get_bh(curf); @@ -1165,8 +1268,10 @@ static int get_parents(struct tree_balance *tb, int h) return CARRY_ON; } -/* it is possible to remove node as result of shiftings to - neighbors even when we insert or paste item. */ +/* + * it is possible to remove node as result of shiftings to + * neighbors even when we insert or paste item. + */ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, struct tree_balance *tb, int h) { @@ -1175,21 +1280,22 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, struct item_head *ih; struct reiserfs_key *r_key = NULL; - ih = B_N_PITEM_HEAD(Sh, 0); + ih = item_head(Sh, 0); if (tb->CFR[h]) - r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]); + r_key = internal_key(tb->CFR[h], tb->rkey[h]); if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes /* shifting may merge items which might save space */ - ((!h - && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0) + && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0) - ((!h && r_key && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0) + ((h) ? KEY_SIZE : 0)) { /* node can not be removed */ - if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + if (sfree >= levbytes) { + /* new item fits into node S[h] without any shifting */ if (!h) tb->s0num = B_NR_ITEMS(Sh) + @@ -1202,7 +1308,8 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, return !NO_BALANCING_NEEDED; } -/* Check whether current node S[h] is balanced when increasing its size by +/* + * Check whether current node S[h] is balanced when increasing its size by * Inserting or Pasting. * Calculate parameters for balancing for current level h. * Parameters: @@ -1219,39 +1326,48 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, static int ip_check_balance(struct tree_balance *tb, int h) { struct virtual_node *vn = tb->tb_vn; - int levbytes, /* Number of bytes that must be inserted into (value - is negative if bytes are deleted) buffer which - contains node being balanced. The mnemonic is - that the attempted change in node space used level - is levbytes bytes. */ - ret; + /* + * Number of bytes that must be inserted into (value is negative + * if bytes are deleted) buffer which contains node being balanced. + * The mnemonic is that the attempted change in node space used + * level is levbytes bytes. + */ + int levbytes; + int ret; int lfree, sfree, rfree /* free space in L, S and R */ ; - /* nver is short for number of vertixes, and lnver is the number if - we shift to the left, rnver is the number if we shift to the - right, and lrnver is the number if we shift in both directions. - The goal is to minimize first the number of vertixes, and second, - the number of vertixes whose contents are changed by shifting, - and third the number of uncached vertixes whose contents are - changed by shifting and must be read from disk. */ + /* + * nver is short for number of vertixes, and lnver is the number if + * we shift to the left, rnver is the number if we shift to the + * right, and lrnver is the number if we shift in both directions. + * The goal is to minimize first the number of vertixes, and second, + * the number of vertixes whose contents are changed by shifting, + * and third the number of uncached vertixes whose contents are + * changed by shifting and must be read from disk. + */ int nver, lnver, rnver, lrnver; - /* used at leaf level only, S0 = S[0] is the node being balanced, - sInum [ I = 0,1,2 ] is the number of items that will - remain in node SI after balancing. S1 and S2 are new - nodes that might be created. */ + /* + * used at leaf level only, S0 = S[0] is the node being balanced, + * sInum [ I = 0,1,2 ] is the number of items that will + * remain in node SI after balancing. S1 and S2 are new + * nodes that might be created. + */ - /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters. - where 4th parameter is s1bytes and 5th - s2bytes + /* + * we perform 8 calls to get_num_ver(). For each call we + * calculate five parameters. where 4th parameter is s1bytes + * and 5th - s2bytes + * + * s0num, s1num, s2num for 8 cases + * 0,1 - do not shift and do not shift but bottle + * 2 - shift only whole item to left + * 3 - shift to left and bottle as much as possible + * 4,5 - shift to right (whole items and as much as possible + * 6,7 - shift to both directions (whole items and as much as possible) */ - short snum012[40] = { 0, }; /* s0num, s1num, s2num for 8 cases - 0,1 - do not shift and do not shift but bottle - 2 - shift only whole item to left - 3 - shift to left and bottle as much as possible - 4,5 - shift to right (whole items and as much as possible - 6,7 - shift to both directions (whole items and as much as possible) - */ + short snum012[40] = { 0, }; /* Sh is the node whose balance is currently being checked */ struct buffer_head *Sh; @@ -1265,9 +1381,10 @@ static int ip_check_balance(struct tree_balance *tb, int h) reiserfs_panic(tb->tb_sb, "vs-8210", "S[0] can not be 0"); switch (ret = get_empty_nodes(tb, h)) { + /* no balancing for higher levels needed */ case CARRY_ON: set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + return NO_BALANCING_NEEDED; case NO_DISK_SPACE: case REPEAT_SEARCH: @@ -1278,7 +1395,9 @@ static int ip_check_balance(struct tree_balance *tb, int h) } } - if ((ret = get_parents(tb, h)) != CARRY_ON) /* get parents of S[h] neighbors. */ + /* get parents of S[h] neighbors. */ + ret = get_parents(tb, h); + if (ret != CARRY_ON) return ret; sfree = B_FREE_SPACE(Sh); @@ -1287,38 +1406,44 @@ static int ip_check_balance(struct tree_balance *tb, int h) rfree = get_rfree(tb, h); lfree = get_lfree(tb, h); + /* and new item fits into node S[h] without any shifting */ if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED) - /* and new item fits into node S[h] without any shifting */ return NO_BALANCING_NEEDED; create_virtual_node(tb, h); /* - determine maximal number of items we can shift to the left neighbor (in tb structure) - and the maximal number of bytes that can flow to the left neighbor - from the left most liquid item that cannot be shifted from S[0] entirely (returned value) + * determine maximal number of items we can shift to the left + * neighbor (in tb structure) and the maximal number of bytes + * that can flow to the left neighbor from the left most liquid + * item that cannot be shifted from S[0] entirely (returned value) */ check_left(tb, h, lfree); /* - determine maximal number of items we can shift to the right neighbor (in tb structure) - and the maximal number of bytes that can flow to the right neighbor - from the right most liquid item that cannot be shifted from S[0] entirely (returned value) + * determine maximal number of items we can shift to the right + * neighbor (in tb structure) and the maximal number of bytes + * that can flow to the right neighbor from the right most liquid + * item that cannot be shifted from S[0] entirely (returned value) */ check_right(tb, h, rfree); - /* all contents of internal node S[h] can be moved into its - neighbors, S[h] will be removed after balancing */ + /* + * all contents of internal node S[h] can be moved into its + * neighbors, S[h] will be removed after balancing + */ if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { int to_r; - /* Since we are working on internal nodes, and our internal - nodes have fixed size entries, then we can balance by the - number of items rather than the space they consume. In this - routine we set the left node equal to the right node, - allowing a difference of less than or equal to 1 child - pointer. */ + /* + * Since we are working on internal nodes, and our internal + * nodes have fixed size entries, then we can balance by the + * number of items rather than the space they consume. In this + * routine we set the left node equal to the right node, + * allowing a difference of less than or equal to 1 child + * pointer. + */ to_r = ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - @@ -1328,7 +1453,10 @@ static int ip_check_balance(struct tree_balance *tb, int h) return CARRY_ON; } - /* this checks balance condition, that any two neighboring nodes can not fit in one node */ + /* + * this checks balance condition, that any two neighboring nodes + * can not fit in one node + */ RFALSE(h && (tb->lnum[h] >= vn->vn_nr_item + 1 || tb->rnum[h] >= vn->vn_nr_item + 1), @@ -1337,16 +1465,22 @@ static int ip_check_balance(struct tree_balance *tb, int h) (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))), "vs-8225: tree is not balanced on leaf level"); - /* all contents of S[0] can be moved into its neighbors - S[0] will be removed after balancing. */ + /* + * all contents of S[0] can be moved into its neighbors + * S[0] will be removed after balancing. + */ if (!h && is_leaf_removable(tb)) return CARRY_ON; - /* why do we perform this check here rather than earlier?? - Answer: we can win 1 node in some cases above. Moreover we - checked it above, when we checked, that S[0] is not removable - in principle */ - if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + /* + * why do we perform this check here rather than earlier?? + * Answer: we can win 1 node in some cases above. Moreover we + * checked it above, when we checked, that S[0] is not removable + * in principle + */ + + /* new item fits into node S[h] without any shifting */ + if (sfree >= levbytes) { if (!h) tb->s0num = vn->vn_nr_item; set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); @@ -1355,18 +1489,19 @@ static int ip_check_balance(struct tree_balance *tb, int h) { int lpar, rpar, nset, lset, rset, lrset; - /* - * regular overflowing of the node - */ + /* regular overflowing of the node */ - /* get_num_ver works in 2 modes (FLOW & NO_FLOW) - lpar, rpar - number of items we can shift to left/right neighbor (including splitting item) - nset, lset, rset, lrset - shows, whether flowing items give better packing + /* + * get_num_ver works in 2 modes (FLOW & NO_FLOW) + * lpar, rpar - number of items we can shift to left/right + * neighbor (including splitting item) + * nset, lset, rset, lrset - shows, whether flowing items + * give better packing */ #define FLOW 1 #define NO_FLOW 0 /* do not any splitting */ - /* we choose one the following */ + /* we choose one of the following */ #define NOTHING_SHIFT_NO_FLOW 0 #define NOTHING_SHIFT_FLOW 5 #define LEFT_SHIFT_NO_FLOW 10 @@ -1379,10 +1514,13 @@ static int ip_check_balance(struct tree_balance *tb, int h) lpar = tb->lnum[h]; rpar = tb->rnum[h]; - /* calculate number of blocks S[h] must be split into when - nothing is shifted to the neighbors, - as well as number of items in each part of the split node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any */ + /* + * calculate number of blocks S[h] must be split into when + * nothing is shifted to the neighbors, as well as number of + * items in each part of the split node (s012 numbers), + * and number of bytes (s1bytes) of the shared drop which + * flow to S1 if any + */ nset = NOTHING_SHIFT_NO_FLOW; nver = get_num_ver(vn->vn_mode, tb, h, 0, -1, h ? vn->vn_nr_item : 0, -1, @@ -1391,7 +1529,10 @@ static int ip_check_balance(struct tree_balance *tb, int h) if (!h) { int nver1; - /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */ + /* + * note, that in this case we try to bottle + * between S[0] and S1 (S1 - the first new node) + */ nver1 = get_num_ver(vn->vn_mode, tb, h, 0, -1, 0, -1, snum012 + NOTHING_SHIFT_FLOW, FLOW); @@ -1399,11 +1540,13 @@ static int ip_check_balance(struct tree_balance *tb, int h) nset = NOTHING_SHIFT_FLOW, nver = nver1; } - /* calculate number of blocks S[h] must be split into when - l_shift_num first items and l_shift_bytes of the right most - liquid item to be shifted are shifted to the left neighbor, - as well as number of items in each part of the splitted node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any + /* + * calculate number of blocks S[h] must be split into when + * l_shift_num first items and l_shift_bytes of the right + * most liquid item to be shifted are shifted to the left + * neighbor, as well as number of items in each part of the + * splitted node (s012 numbers), and number of bytes + * (s1bytes) of the shared drop which flow to S1 if any */ lset = LEFT_SHIFT_NO_FLOW; lnver = get_num_ver(vn->vn_mode, tb, h, @@ -1422,11 +1565,13 @@ static int ip_check_balance(struct tree_balance *tb, int h) lset = LEFT_SHIFT_FLOW, lnver = lnver1; } - /* calculate number of blocks S[h] must be split into when - r_shift_num first items and r_shift_bytes of the left most - liquid item to be shifted are shifted to the right neighbor, - as well as number of items in each part of the splitted node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any + /* + * calculate number of blocks S[h] must be split into when + * r_shift_num first items and r_shift_bytes of the left most + * liquid item to be shifted are shifted to the right neighbor, + * as well as number of items in each part of the splitted + * node (s012 numbers), and number of bytes (s1bytes) of the + * shared drop which flow to S1 if any */ rset = RIGHT_SHIFT_NO_FLOW; rnver = get_num_ver(vn->vn_mode, tb, h, @@ -1451,10 +1596,12 @@ static int ip_check_balance(struct tree_balance *tb, int h) rset = RIGHT_SHIFT_FLOW, rnver = rnver1; } - /* calculate number of blocks S[h] must be split into when - items are shifted in both directions, - as well as number of items in each part of the splitted node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any + /* + * calculate number of blocks S[h] must be split into when + * items are shifted in both directions, as well as number + * of items in each part of the splitted node (s012 numbers), + * and number of bytes (s1bytes) of the shared drop which + * flow to S1 if any */ lrset = LR_SHIFT_NO_FLOW; lrnver = get_num_ver(vn->vn_mode, tb, h, @@ -1481,10 +1628,12 @@ static int ip_check_balance(struct tree_balance *tb, int h) lrset = LR_SHIFT_FLOW, lrnver = lrnver1; } - /* Our general shifting strategy is: - 1) to minimized number of new nodes; - 2) to minimized number of neighbors involved in shifting; - 3) to minimized number of disk reads; */ + /* + * Our general shifting strategy is: + * 1) to minimized number of new nodes; + * 2) to minimized number of neighbors involved in shifting; + * 3) to minimized number of disk reads; + */ /* we can win TWO or ONE nodes by shifting in both directions */ if (lrnver < lnver && lrnver < rnver) { @@ -1508,42 +1657,59 @@ static int ip_check_balance(struct tree_balance *tb, int h) return CARRY_ON; } - /* if shifting doesn't lead to better packing then don't shift */ + /* + * if shifting doesn't lead to better packing + * then don't shift + */ if (nver == lrnver) { set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1, -1); return CARRY_ON; } - /* now we know that for better packing shifting in only one - direction either to the left or to the right is required */ + /* + * now we know that for better packing shifting in only one + * direction either to the left or to the right is required + */ - /* if shifting to the left is better than shifting to the right */ + /* + * if shifting to the left is better than + * shifting to the right + */ if (lnver < rnver) { SET_PAR_SHIFT_LEFT; return CARRY_ON; } - /* if shifting to the right is better than shifting to the left */ + /* + * if shifting to the right is better than + * shifting to the left + */ if (lnver > rnver) { SET_PAR_SHIFT_RIGHT; return CARRY_ON; } - /* now shifting in either direction gives the same number - of nodes and we can make use of the cached neighbors */ + /* + * now shifting in either direction gives the same number + * of nodes and we can make use of the cached neighbors + */ if (is_left_neighbor_in_cache(tb, h)) { SET_PAR_SHIFT_LEFT; return CARRY_ON; } - /* shift to the right independently on whether the right neighbor in cache or not */ + /* + * shift to the right independently on whether the + * right neighbor in cache or not + */ SET_PAR_SHIFT_RIGHT; return CARRY_ON; } } -/* Check whether current node S[h] is balanced when Decreasing its size by +/* + * Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Cutting for INTERNAL node of S+tree. * Calculate parameters for balancing for current level h. * Parameters: @@ -1563,8 +1729,10 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) { struct virtual_node *vn = tb->tb_vn; - /* Sh is the node whose balance is currently being checked, - and Fh is its father. */ + /* + * Sh is the node whose balance is currently being checked, + * and Fh is its father. + */ struct buffer_head *Sh, *Fh; int maxsize, ret; int lfree, rfree /* free space in L and R */ ; @@ -1574,19 +1742,25 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) maxsize = MAX_CHILD_SIZE(Sh); -/* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */ -/* new_nr_item = number of items node would have if operation is */ -/* performed without balancing (new_nr_item); */ + /* + * using tb->insert_size[h], which is negative in this case, + * create_virtual_node calculates: + * new_nr_item = number of items node would have if operation is + * performed without balancing (new_nr_item); + */ create_virtual_node(tb, h); if (!Fh) { /* S[h] is the root. */ + /* no balancing for higher levels needed */ if (vn->vn_nr_item > 0) { set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + return NO_BALANCING_NEEDED; } - /* new_nr_item == 0. + /* + * new_nr_item == 0. * Current root will be deleted resulting in - * decrementing the tree height. */ + * decrementing the tree height. + */ set_parameters(tb, h, 0, 0, 0, NULL, -1, -1); return CARRY_ON; } @@ -1602,12 +1776,18 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) check_left(tb, h, lfree); check_right(tb, h, rfree); - if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid. - * In this case we balance only if it leads to better packing. */ - if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors, - * which is impossible with greater values of new_nr_item. */ + /* + * Balance condition for the internal node is valid. + * In this case we balance only if it leads to better packing. + */ + if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { + /* + * Here we join S[h] with one of its neighbors, + * which is impossible with greater values of new_nr_item. + */ + if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { + /* All contents of S[h] can be moved to L[h]. */ if (tb->lnum[h] >= vn->vn_nr_item + 1) { - /* All contents of S[h] can be moved to L[h]. */ int n; int order_L; @@ -1623,8 +1803,8 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) return CARRY_ON; } + /* All contents of S[h] can be moved to R[h]. */ if (tb->rnum[h] >= vn->vn_nr_item + 1) { - /* All contents of S[h] can be moved to R[h]. */ int n; int order_R; @@ -1641,8 +1821,11 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) } } + /* + * All contents of S[h] can be moved to the neighbors + * (L[h] & R[h]). + */ if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { - /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ int to_r; to_r = @@ -1659,7 +1842,10 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) return NO_BALANCING_NEEDED; } - /* Current node contain insufficient number of items. Balancing is required. */ + /* + * Current node contain insufficient number of items. + * Balancing is required. + */ /* Check whether we can merge S[h] with left neighbor. */ if (tb->lnum[h] >= vn->vn_nr_item + 1) if (is_left_neighbor_in_cache(tb, h) @@ -1726,7 +1912,8 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) return CARRY_ON; } -/* Check whether current node S[h] is balanced when Decreasing its size by +/* + * Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Truncating for LEAF node of S+tree. * Calculate parameters for balancing for current level h. * Parameters: @@ -1743,15 +1930,21 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h) { struct virtual_node *vn = tb->tb_vn; - /* Number of bytes that must be deleted from - (value is negative if bytes are deleted) buffer which - contains node being balanced. The mnemonic is that the - attempted change in node space used level is levbytes bytes. */ + /* + * Number of bytes that must be deleted from + * (value is negative if bytes are deleted) buffer which + * contains node being balanced. The mnemonic is that the + * attempted change in node space used level is levbytes bytes. + */ int levbytes; + /* the maximal item size */ int maxsize, ret; - /* S0 is the node whose balance is currently being checked, - and F0 is its father. */ + + /* + * S0 is the node whose balance is currently being checked, + * and F0 is its father. + */ struct buffer_head *S0, *F0; int lfree, rfree /* free space in L and R */ ; @@ -1784,9 +1977,11 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h) if (are_leaves_removable(tb, lfree, rfree)) return CARRY_ON; - /* determine maximal number of items we can shift to the left/right neighbor - and the maximal number of bytes that can flow to the left/right neighbor - from the left/right most liquid item that cannot be shifted from S[0] entirely + /* + * determine maximal number of items we can shift to the left/right + * neighbor and the maximal number of bytes that can flow to the + * left/right neighbor from the left/right most liquid item that + * cannot be shifted from S[0] entirely */ check_left(tb, h, lfree); check_right(tb, h, rfree); @@ -1810,7 +2005,10 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h) return CARRY_ON; } - /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */ + /* + * All contents of S[0] can be moved to the neighbors (L[0] & R[0]). + * Set parameters and return + */ if (is_leaf_removable(tb)) return CARRY_ON; @@ -1820,7 +2018,8 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h) return NO_BALANCING_NEEDED; } -/* Check whether current node S[h] is balanced when Decreasing its size by +/* + * Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Cutting. * Calculate parameters for balancing for current level h. * Parameters: @@ -1844,15 +2043,16 @@ static int dc_check_balance(struct tree_balance *tb, int h) return dc_check_balance_leaf(tb, h); } -/* Check whether current node S[h] is balanced. +/* + * Check whether current node S[h] is balanced. * Calculate parameters for balancing for current level h. * Parameters: * * tb tree_balance structure: * - * tb is a large structure that must be read about in the header file - * at the same time as this procedure if the reader is to successfully - * understand this procedure + * tb is a large structure that must be read about in the header + * file at the same time as this procedure if the reader is + * to successfully understand this procedure * * h current level of the node; * inum item number in S[h]; @@ -1882,8 +2082,8 @@ static int check_balance(int mode, RFALSE(mode == M_INSERT && !vn->vn_ins_ih, "vs-8255: ins_ih can not be 0 in insert mode"); + /* Calculate balance parameters when size of node is increasing. */ if (tb->insert_size[h] > 0) - /* Calculate balance parameters when size of node is increasing. */ return ip_check_balance(tb, h); /* Calculate balance parameters when size of node is decreasing. */ @@ -1911,21 +2111,23 @@ static int get_direct_parent(struct tree_balance *tb, int h) PATH_OFFSET_POSITION(path, path_offset - 1) = 0; return CARRY_ON; } - return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */ + /* Root is changed and we must recalculate the path. */ + return REPEAT_SEARCH; } + /* Parent in the path is not in the tree. */ if (!B_IS_IN_TREE (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1))) - return REPEAT_SEARCH; /* Parent in the path is not in the tree. */ + return REPEAT_SEARCH; if ((position = PATH_OFFSET_POSITION(path, path_offset - 1)) > B_NR_ITEMS(bh)) return REPEAT_SEARCH; + /* Parent in the path is not parent of the current node in the tree. */ if (B_N_CHILD_NUM(bh, position) != PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr) - /* Parent in the path is not parent of the current node in the tree. */ return REPEAT_SEARCH; if (buffer_locked(bh)) { @@ -1936,10 +2138,15 @@ static int get_direct_parent(struct tree_balance *tb, int h) return REPEAT_SEARCH; } - return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */ + /* + * Parent in the path is unlocked and really parent + * of the current node. + */ + return CARRY_ON; } -/* Using lnum[h] and rnum[h] we should determine what neighbors +/* + * Using lnum[h] and rnum[h] we should determine what neighbors * of S[h] we * need in order to balance S[h], and get them if necessary. * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; @@ -1997,7 +2204,7 @@ static int get_neighbors(struct tree_balance *tb, int h) } /* We need right neighbor to balance S[path_offset]. */ - if (tb->rnum[h]) { /* We need right neighbor to balance S[path_offset]. */ + if (tb->rnum[h]) { PROC_INFO_INC(sb, need_r_neighbor[h]); bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); @@ -2053,9 +2260,11 @@ static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) (max_num_of_entries - 1) * sizeof(__u16)); } -/* maybe we should fail balancing we are going to perform when kmalloc - fails several times. But now it will loop until kmalloc gets - required memory */ +/* + * maybe we should fail balancing we are going to perform when kmalloc + * fails several times. But now it will loop until kmalloc gets + * required memory + */ static int get_mem_for_virtual_node(struct tree_balance *tb) { int check_fs = 0; @@ -2064,8 +2273,8 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path)); + /* we have to allocate more memory for virtual node */ if (size > tb->vn_buf_size) { - /* we have to allocate more memory for virtual node */ if (tb->vn_buf) { /* free memory allocated before */ kfree(tb->vn_buf); @@ -2079,10 +2288,12 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) /* get memory for virtual item */ buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); if (!buf) { - /* getting memory with GFP_KERNEL priority may involve - balancing now (due to indirect_to_direct conversion on - dcache shrinking). So, release path and collected - resources here */ + /* + * getting memory with GFP_KERNEL priority may involve + * balancing now (due to indirect_to_direct conversion + * on dcache shrinking). So, release path and collected + * resources here + */ free_buffers_in_tb(tb); buf = kmalloc(size, GFP_NOFS); if (!buf) { @@ -2168,8 +2379,10 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) for (i = tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) { if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) { - /* if I understand correctly, we can only be sure the last buffer - ** in the path is in the tree --clm + /* + * if I understand correctly, we can only + * be sure the last buffer in the path is + * in the tree --clm */ #ifdef CONFIG_REISERFS_CHECK if (PATH_PLAST_BUFFER(tb->tb_path) == @@ -2256,13 +2469,15 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) } } } - /* as far as I can tell, this is not required. The FEB list seems - ** to be full of newly allocated nodes, which will never be locked, - ** dirty, or anything else. - ** To be safe, I'm putting in the checks and waits in. For the moment, - ** they are needed to keep the code in journal.c from complaining - ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well. - ** --clm + + /* + * as far as I can tell, this is not required. The FEB list + * seems to be full of newly allocated nodes, which will + * never be locked, dirty, or anything else. + * To be safe, I'm putting in the checks and waits in. + * For the moment, they are needed to keep the code in + * journal.c from complaining about the buffer. + * That code is inside CONFIG_REISERFS_CHECK as well. --clm */ for (i = 0; !locked && i < MAX_FEB_SIZE; i++) { if (tb->FEB[i]) { @@ -2300,7 +2515,8 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) return CARRY_ON; } -/* Prepare for balancing, that is +/* + * Prepare for balancing, that is * get all necessary parents, and neighbors; * analyze what and where should be moved; * get sufficient number of new nodes; @@ -2309,13 +2525,14 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) * When ported to SMP kernels, only at the last moment after all needed nodes * are collected in cache, will the resources be locked using the usual * textbook ordered lock acquisition algorithms. Note that ensuring that - * this code neither write locks what it does not need to write lock nor locks out of order - * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans + * this code neither write locks what it does not need to write lock nor locks + * out of order will be a pain in the butt that could have been avoided. + * Grumble grumble. -Hans * * fix is meant in the sense of render unchanging * - * Latency might be improved by first gathering a list of what buffers are needed - * and then getting as many of them in parallel as possible? -Hans + * Latency might be improved by first gathering a list of what buffers + * are needed and then getting as many of them in parallel as possible? -Hans * * Parameters: * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append) @@ -2335,8 +2552,9 @@ int fix_nodes(int op_mode, struct tree_balance *tb, int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path); int pos_in_item; - /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared - ** during wait_tb_buffers_run + /* + * we set wait_tb_buffers_run when we have to restore any dirty + * bits cleared during wait_tb_buffers_run */ int wait_tb_buffers_run = 0; struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); @@ -2347,14 +2565,15 @@ int fix_nodes(int op_mode, struct tree_balance *tb, tb->fs_gen = get_generation(tb->tb_sb); - /* we prepare and log the super here so it will already be in the - ** transaction when do_balance needs to change it. - ** This way do_balance won't have to schedule when trying to prepare - ** the super for logging + /* + * we prepare and log the super here so it will already be in the + * transaction when do_balance needs to change it. + * This way do_balance won't have to schedule when trying to prepare + * the super for logging */ reiserfs_prepare_for_journal(tb->tb_sb, SB_BUFFER_WITH_SB(tb->tb_sb), 1); - journal_mark_dirty(tb->transaction_handle, tb->tb_sb, + journal_mark_dirty(tb->transaction_handle, SB_BUFFER_WITH_SB(tb->tb_sb)); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; @@ -2408,7 +2627,7 @@ int fix_nodes(int op_mode, struct tree_balance *tb, #endif if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH) - // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat + /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */ return REPEAT_SEARCH; /* Starting from the leaf level; for all levels h of the tree. */ @@ -2427,7 +2646,10 @@ int fix_nodes(int op_mode, struct tree_balance *tb, goto repeat; if (h != MAX_HEIGHT - 1) tb->insert_size[h + 1] = 0; - /* ok, analysis and resource gathering are complete */ + /* + * ok, analysis and resource gathering + * are complete + */ break; } goto repeat; @@ -2437,15 +2659,19 @@ int fix_nodes(int op_mode, struct tree_balance *tb, if (ret != CARRY_ON) goto repeat; - /* No disk space, or schedule occurred and analysis may be - * invalid and needs to be redone. */ + /* + * No disk space, or schedule occurred and analysis may be + * invalid and needs to be redone. + */ ret = get_empty_nodes(tb, h); if (ret != CARRY_ON) goto repeat; + /* + * We have a positive insert size but no nodes exist on this + * level, this means that we are creating a new root. + */ if (!PATH_H_PBUFFER(tb->tb_path, h)) { - /* We have a positive insert size but no nodes exist on this - level, this means that we are creating a new root. */ RFALSE(tb->blknum[h] != 1, "PAP-8350: creating new empty root"); @@ -2453,11 +2679,13 @@ int fix_nodes(int op_mode, struct tree_balance *tb, if (h < MAX_HEIGHT - 1) tb->insert_size[h + 1] = 0; } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) { + /* + * The tree needs to be grown, so this node S[h] + * which is the root node is split into two nodes, + * and a new node (S[h+1]) will be created to + * become the root node. + */ if (tb->blknum[h] > 1) { - /* The tree needs to be grown, so this node S[h] - which is the root node is split into two nodes, - and a new node (S[h+1]) will be created to - become the root node. */ RFALSE(h == MAX_HEIGHT - 1, "PAP-8355: attempt to create too high of a tree"); @@ -2487,12 +2715,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb, goto repeat; } - repeat: - // fix_nodes was unable to perform its calculation due to - // filesystem got changed under us, lack of free disk space or i/o - // failure. If the first is the case - the search will be - // repeated. For now - free all resources acquired so far except - // for the new allocated nodes +repeat: + /* + * fix_nodes was unable to perform its calculation due to + * filesystem got changed under us, lack of free disk space or i/o + * failure. If the first is the case - the search will be + * repeated. For now - free all resources acquired so far except + * for the new allocated nodes + */ { int i; @@ -2548,8 +2778,6 @@ int fix_nodes(int op_mode, struct tree_balance *tb, } -/* Anatoly will probably forgive me renaming tb to tb. I just - wanted to make lines shorter */ void unfix_nodes(struct tree_balance *tb) { int i; @@ -2578,8 +2806,10 @@ void unfix_nodes(struct tree_balance *tb) for (i = 0; i < MAX_FEB_SIZE; i++) { if (tb->FEB[i]) { b_blocknr_t blocknr = tb->FEB[i]->b_blocknr; - /* de-allocated block which was not used by balancing and - bforget about buffer for it */ + /* + * de-allocated block which was not used by + * balancing and bforget about buffer for it + */ brelse(tb->FEB[i]); reiserfs_free_block(tb->transaction_handle, NULL, blocknr, 0); diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c index 91b0cc1242a..7a26c4fe6c4 100644 --- a/fs/reiserfs/hashes.c +++ b/fs/reiserfs/hashes.c @@ -12,12 +12,6 @@ * Yura's function is added (04/07/2000) */ -// -// keyed_hash -// yura_hash -// r5_hash -// - #include <linux/kernel.h> #include "reiserfs.h" #include <asm/types.h> @@ -56,7 +50,7 @@ u32 keyed_hash(const signed char *msg, int len) u32 pad; int i; - // assert(len >= 0 && len < 256); + /* assert(len >= 0 && len < 256); */ pad = (u32) len | ((u32) len << 8); pad |= pad << 16; @@ -127,9 +121,10 @@ u32 keyed_hash(const signed char *msg, int len) return h0 ^ h1; } -/* What follows in this file is copyright 2000 by Hans Reiser, and the - * licensing of what follows is governed by reiserfs/README */ - +/* + * What follows in this file is copyright 2000 by Hans Reiser, and the + * licensing of what follows is governed by reiserfs/README + */ u32 yura_hash(const signed char *msg, int len) { int j, pow; diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index e1978fd895f..73231b1ebdb 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -12,7 +12,10 @@ int balance_internal(struct tree_balance *, int, int, struct item_head *, struct buffer_head **); -/* modes of internal_shift_left, internal_shift_right and internal_insert_childs */ +/* + * modes of internal_shift_left, internal_shift_right and + * internal_insert_childs + */ #define INTERNAL_SHIFT_FROM_S_TO_L 0 #define INTERNAL_SHIFT_FROM_R_TO_S 1 #define INTERNAL_SHIFT_FROM_L_TO_S 2 @@ -32,7 +35,9 @@ static void internal_define_dest_src_infos(int shift_mode, memset(src_bi, 0, sizeof(struct buffer_info)); /* define dest, src, dest parent, dest position */ switch (shift_mode) { - case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */ + + /* used in internal_shift_left */ + case INTERNAL_SHIFT_FROM_S_TO_L: src_bi->tb = tb; src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); @@ -52,12 +57,14 @@ static void internal_define_dest_src_infos(int shift_mode, dest_bi->tb = tb; dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); - dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */ + /* dest position is analog of dest->b_item_order */ + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); *d_key = tb->lkey[h]; *cf = tb->CFL[h]; break; - case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */ + /* used in internal_shift_left */ + case INTERNAL_SHIFT_FROM_R_TO_S: src_bi->tb = tb; src_bi->bi_bh = tb->R[h]; src_bi->bi_parent = tb->FR[h]; @@ -111,7 +118,8 @@ static void internal_define_dest_src_infos(int shift_mode, } } -/* Insert count node pointers into buffer cur before position to + 1. +/* + * Insert count node pointers into buffer cur before position to + 1. * Insert count items into buffer cur before position to. * Items and node pointers are specified by inserted and bh respectively. */ @@ -146,14 +154,14 @@ static void internal_insert_childs(struct buffer_info *cur_bi, /* copy to_be_insert disk children */ for (i = 0; i < count; i++) { - put_dc_size(&(new_dc[i]), + put_dc_size(&new_dc[i], MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i])); - put_dc_block_number(&(new_dc[i]), bh[i]->b_blocknr); + put_dc_block_number(&new_dc[i], bh[i]->b_blocknr); } memcpy(dc, new_dc, DC_SIZE * count); /* prepare space for count items */ - ih = B_N_PDELIM_KEY(cur, ((to == -1) ? 0 : to)); + ih = internal_key(cur, ((to == -1) ? 0 : to)); memmove(ih + count, ih, (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE); @@ -190,8 +198,10 @@ static void internal_insert_childs(struct buffer_info *cur_bi, } -/* Delete del_num items and node pointers from buffer cur starting from * - * the first_i'th item and first_p'th pointers respectively. */ +/* + * Delete del_num items and node pointers from buffer cur starting from + * the first_i'th item and first_p'th pointers respectively. + */ static void internal_delete_pointers_items(struct buffer_info *cur_bi, int first_p, int first_i, int del_num) @@ -233,7 +243,7 @@ static void internal_delete_pointers_items(struct buffer_info *cur_bi, dc = B_N_CHILD(cur, first_p); memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE); - key = B_N_PDELIM_KEY(cur, first_i); + key = internal_key(cur, first_i); memmove(key, key + del_num, (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - del_num) * DC_SIZE); @@ -270,22 +280,30 @@ static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n) i_from = (from == 0) ? from : from - 1; - /* delete n pointers starting from `from' position in CUR; - delete n keys starting from 'i_from' position in CUR; + /* + * delete n pointers starting from `from' position in CUR; + * delete n keys starting from 'i_from' position in CUR; */ internal_delete_pointers_items(cur_bi, from, i_from, n); } -/* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest -* last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest - * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest +/* + * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer + * dest + * last_first == FIRST_TO_LAST means that we copy first items + * from src to tail of dest + * last_first == LAST_TO_FIRST means that we copy last items + * from src to head of dest */ static void internal_copy_pointers_items(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int cpy_num) { - /* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST * - * as delimiting key have already inserted to buffer dest.*/ + /* + * ATTENTION! Number of node pointers in DEST is equal to number + * of items in DEST as delimiting key have already inserted to + * buffer dest. + */ struct buffer_head *dest = dest_bi->bi_bh; int nr_dest, nr_src; int dest_order, src_order; @@ -330,13 +348,13 @@ static void internal_copy_pointers_items(struct buffer_info *dest_bi, memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num); /* prepare space for cpy_num - 1 item headers */ - key = B_N_PDELIM_KEY(dest, dest_order); + key = internal_key(dest, dest_order); memmove(key + cpy_num - 1, key, KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + cpy_num)); /* insert headers */ - memcpy(key, B_N_PDELIM_KEY(src, src_order), KEY_SIZE * (cpy_num - 1)); + memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1)); /* sizes, item number */ set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1)); @@ -366,7 +384,9 @@ static void internal_copy_pointers_items(struct buffer_info *dest_bi, } -/* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest. +/* + * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to + * buffer dest. * Delete cpy_num - del_par items and node pointers from buffer src. * last_first == FIRST_TO_LAST means, that we copy/delete first items from src. * last_first == LAST_TO_FIRST means, that we copy/delete last items from src. @@ -385,8 +405,10 @@ static void internal_move_pointers_items(struct buffer_info *dest_bi, if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ first_pointer = 0; first_item = 0; - /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, - for key - with first_item */ + /* + * delete cpy_num - del_par pointers and keys starting for + * pointers with first_pointer, for key - with first_item + */ internal_delete_pointers_items(src_bi, first_pointer, first_item, cpy_num - del_par); } else { /* shift_right occurs */ @@ -404,7 +426,9 @@ static void internal_move_pointers_items(struct buffer_info *dest_bi, } /* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */ -static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_before, /* insert key before key with n_dest number */ +static void internal_insert_key(struct buffer_info *dest_bi, + /* insert key before key with n_dest number */ + int dest_position_before, struct buffer_head *src, int src_position) { struct buffer_head *dest = dest_bi->bi_bh; @@ -429,12 +453,12 @@ static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_b nr = blkh_nr_item(blkh); /* prepare space for inserting key */ - key = B_N_PDELIM_KEY(dest, dest_position_before); + key = internal_key(dest, dest_position_before); memmove(key + 1, key, (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE); /* insert key */ - memcpy(key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE); + memcpy(key, internal_key(src, src_position), KEY_SIZE); /* Change dirt, free space, item number fields. */ @@ -453,13 +477,19 @@ static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_b } } -/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest. - * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest. +/* + * Insert d_key'th (delimiting) key from buffer cfl to tail of dest. + * Copy pointer_amount node pointers and pointer_amount - 1 items from + * buffer src to buffer dest. * Replace d_key'th key in buffer cfl. * Delete pointer_amount items and node pointers from buffer src. */ /* this can be invoked both to shift from S to L and from R to S */ -static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */ +static void internal_shift_left( + /* + * INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S + */ + int mode, struct tree_balance *tb, int h, int pointer_amount) { @@ -473,7 +503,10 @@ static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FRO /*printk("pointer_amount = %d\n",pointer_amount); */ if (pointer_amount) { - /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */ + /* + * insert delimiting key from common father of dest and + * src to node dest into position B_NR_ITEM(dest) + */ internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position); @@ -492,7 +525,8 @@ static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FRO } -/* Insert delimiting key to L[h]. +/* + * Insert delimiting key to L[h]. * Copy n node pointers and n - 1 items from buffer S[h] to L[h]. * Delete n - 1 items and node pointers from buffer S[h]. */ @@ -507,23 +541,27 @@ static void internal_shift1_left(struct tree_balance *tb, internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); - if (pointer_amount > 0) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ + /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ + if (pointer_amount > 0) internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position); - /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]); */ /* last parameter is del_parameter */ internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 1); - /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1); */ } -/* Insert d_key'th (delimiting) key from buffer cfr to head of dest. +/* + * Insert d_key'th (delimiting) key from buffer cfr to head of dest. * Copy n node pointers and n - 1 items from buffer src to buffer dest. * Replace d_key'th key in buffer cfr. * Delete n items and node pointers from buffer src. */ -static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */ +static void internal_shift_right( + /* + * INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S + */ + int mode, struct tree_balance *tb, int h, int pointer_amount) { @@ -538,7 +576,10 @@ static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FR nr = B_NR_ITEMS(src_bi.bi_bh); if (pointer_amount > 0) { - /* insert delimiting key from common father of dest and src to dest node into position 0 */ + /* + * insert delimiting key from common father of dest + * and src to dest node into position 0 + */ internal_insert_key(&dest_bi, 0, cf, d_key_position); if (nr == pointer_amount - 1) { RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ || @@ -559,7 +600,8 @@ static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FR pointer_amount, 0); } -/* Insert delimiting key to R[h]. +/* + * Insert delimiting key to R[h]. * Copy n node pointers and n - 1 items from buffer S[h] to R[h]. * Delete n - 1 items and node pointers from buffer S[h]. */ @@ -574,18 +616,19 @@ static void internal_shift1_right(struct tree_balance *tb, internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); - if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */ + /* insert rkey from CFR[h] to right neighbor R[h] */ + if (pointer_amount > 0) internal_insert_key(&dest_bi, 0, cf, d_key_position); - /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]); */ /* last parameter is del_parameter */ internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 1); - /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1); */ } -/* Delete insert_num node pointers together with their left items - * and balance current node.*/ +/* + * Delete insert_num node pointers together with their left items + * and balance current node. + */ static void balance_internal_when_delete(struct tree_balance *tb, int h, int child_pos) { @@ -626,9 +669,11 @@ static void balance_internal_when_delete(struct tree_balance *tb, new_root = tb->R[h - 1]; else new_root = tb->L[h - 1]; - /* switch super block's tree root block number to the new value */ + /* + * switch super block's tree root block + * number to the new value */ PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr); - //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; + /*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */ PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) - 1); @@ -636,8 +681,8 @@ static void balance_internal_when_delete(struct tree_balance *tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); /*&&&&&&&&&&&&&&&&&&&&&& */ + /* use check_internal if new root is an internal node */ if (h > 1) - /* use check_internal if new root is an internal node */ check_internal(new_root); /*&&&&&&&&&&&&&&&&&&&&&& */ @@ -648,7 +693,8 @@ static void balance_internal_when_delete(struct tree_balance *tb, return; } - if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { /* join S[h] with L[h] */ + /* join S[h] with L[h] */ + if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { RFALSE(tb->rnum[h] != 0, "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", @@ -660,7 +706,8 @@ static void balance_internal_when_delete(struct tree_balance *tb, return; } - if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { /* join S[h] with R[h] */ + /* join S[h] with R[h] */ + if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { RFALSE(tb->lnum[h] != 0, "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", h, tb->lnum[h]); @@ -671,17 +718,18 @@ static void balance_internal_when_delete(struct tree_balance *tb, return; } - if (tb->lnum[h] < 0) { /* borrow from left neighbor L[h] */ + /* borrow from left neighbor L[h] */ + if (tb->lnum[h] < 0) { RFALSE(tb->rnum[h] != 0, "wrong tb->rnum[%d]==%d when borrow from L[h]", h, tb->rnum[h]); - /*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]); */ internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h, -tb->lnum[h]); return; } - if (tb->rnum[h] < 0) { /* borrow from right neighbor R[h] */ + /* borrow from right neighbor R[h] */ + if (tb->rnum[h] < 0) { RFALSE(tb->lnum[h] != 0, "invalid tb->lnum[%d]==%d when borrow from R[h]", h, tb->lnum[h]); @@ -689,7 +737,8 @@ static void balance_internal_when_delete(struct tree_balance *tb, return; } - if (tb->lnum[h] > 0) { /* split S[h] into two parts and put them into neighbors */ + /* split S[h] into two parts and put them into neighbors */ + if (tb->lnum[h] > 0) { RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1, "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", h, tb->lnum[h], h, tb->rnum[h], n); @@ -717,7 +766,7 @@ static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key) if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) return; - memcpy(B_N_PDELIM_KEY(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE); + memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE); do_balance_mark_internal_dirty(tb, tb->CFL[h], 0); } @@ -732,34 +781,41 @@ static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key) "R[h] can not be empty if it exists (item number=%d)", B_NR_ITEMS(tb->R[h])); - memcpy(B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE); + memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE); do_balance_mark_internal_dirty(tb, tb->CFR[h], 0); } -int balance_internal(struct tree_balance *tb, /* tree_balance structure */ - int h, /* level of the tree */ - int child_pos, struct item_head *insert_key, /* key for insertion on higher level */ - struct buffer_head **insert_ptr /* node for insertion on higher level */ - ) - /* if inserting/pasting - { - child_pos is the position of the node-pointer in S[h] that * - pointed to S[h-1] before balancing of the h-1 level; * - this means that new pointers and items must be inserted AFTER * - child_pos - } - else - { - it is the position of the leftmost pointer that must be deleted (together with - its corresponding key to the left of the pointer) - as a result of the previous level's balancing. - } - */ + +/* + * if inserting/pasting { + * child_pos is the position of the node-pointer in S[h] that + * pointed to S[h-1] before balancing of the h-1 level; + * this means that new pointers and items must be inserted AFTER + * child_pos + * } else { + * it is the position of the leftmost pointer that must be deleted + * (together with its corresponding key to the left of the pointer) + * as a result of the previous level's balancing. + * } + */ + +int balance_internal(struct tree_balance *tb, + int h, /* level of the tree */ + int child_pos, + /* key for insertion on higher level */ + struct item_head *insert_key, + /* node for insertion on higher level */ + struct buffer_head **insert_ptr) { struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); struct buffer_info bi; - int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */ + + /* + * we return this: it is 0 if there is no S[h], + * else it is tb->S[h]->b_item_order + */ + int order; int insert_num, n, k; struct buffer_head *S_new; struct item_head new_insert_key; @@ -774,8 +830,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure (tbSh) ? PATH_H_POSITION(tb->tb_path, h + 1) /*tb->S[h]->b_item_order */ : 0; - /* Using insert_size[h] calculate the number insert_num of items - that must be inserted to or deleted from S[h]. */ + /* + * Using insert_size[h] calculate the number insert_num of items + * that must be inserted to or deleted from S[h]. + */ insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE)); /* Check whether insert_num is proper * */ @@ -794,23 +852,21 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure k = 0; if (tb->lnum[h] > 0) { - /* shift lnum[h] items from S[h] to the left neighbor L[h]. - check how many of new items fall into L[h] or CFL[h] after - shifting */ + /* + * shift lnum[h] items from S[h] to the left neighbor L[h]. + * check how many of new items fall into L[h] or CFL[h] after + * shifting + */ n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */ if (tb->lnum[h] <= child_pos) { /* new items don't fall into L[h] or CFL[h] */ internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); - /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]); */ child_pos -= tb->lnum[h]; } else if (tb->lnum[h] > child_pos + insert_num) { /* all new items fall into L[h] */ internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h] - insert_num); - /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh, - tb->lnum[h]-insert_num); - */ /* insert insert_num keys and node-pointers into L[h] */ bi.tb = tb; bi.bi_bh = tb->L[h]; @@ -826,7 +882,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure } else { struct disk_child *dc; - /* some items fall into L[h] or CFL[h], but some don't fall */ + /* + * some items fall into L[h] or CFL[h], + * but some don't fall + */ internal_shift1_left(tb, h, child_pos + 1); /* calculate number of new items that fall into L[h] */ k = tb->lnum[h] - child_pos - 1; @@ -841,7 +900,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure replace_lkey(tb, h, insert_key + k); - /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */ + /* + * replace the first node-ptr in S[h] by + * node-ptr to insert_ptr[k] + */ dc = B_N_CHILD(tbSh, 0); put_dc_size(dc, MAX_CHILD_SIZE(insert_ptr[k]) - @@ -860,17 +922,17 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure /* tb->lnum[h] > 0 */ if (tb->rnum[h] > 0) { /*shift rnum[h] items from S[h] to the right neighbor R[h] */ - /* check how many of new items fall into R or CFR after shifting */ + /* + * check how many of new items fall into R or CFR + * after shifting + */ n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ if (n - tb->rnum[h] >= child_pos) /* new items fall into S[h] */ - /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]); */ internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]); else if (n + insert_num - tb->rnum[h] < child_pos) { /* all new items fall into R[h] */ - /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h], - tb->rnum[h] - insert_num); */ internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h] - insert_num); @@ -904,7 +966,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure replace_rkey(tb, h, insert_key + insert_num - k - 1); - /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1] */ + /* + * replace the first node-ptr in R[h] by + * node-ptr insert_ptr[insert_num-k-1] + */ dc = B_N_CHILD(tb->R[h], 0); put_dc_size(dc, MAX_CHILD_SIZE(insert_ptr @@ -921,7 +986,7 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure } } - /** Fill new node that appears instead of S[h] **/ + /** Fill new node that appears instead of S[h] **/ RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); RFALSE(tb->blknum[h] < 0, "blknum can not be < 0"); @@ -997,26 +1062,30 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure /* new items don't fall into S_new */ /* store the delimiting key for the next level */ /* new_insert_key = (n - snum)'th key in S[h] */ - memcpy(&new_insert_key, B_N_PDELIM_KEY(tbSh, n - snum), + memcpy(&new_insert_key, internal_key(tbSh, n - snum), KEY_SIZE); /* last parameter is del_par */ internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, snum, 0); - /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0); */ } else if (n + insert_num - snum < child_pos) { /* all new items fall into S_new */ /* store the delimiting key for the next level */ - /* new_insert_key = (n + insert_item - snum)'th key in S[h] */ + /* + * new_insert_key = (n + insert_item - snum)'th + * key in S[h] + */ memcpy(&new_insert_key, - B_N_PDELIM_KEY(tbSh, n + insert_num - snum), + internal_key(tbSh, n + insert_num - snum), KEY_SIZE); /* last parameter is del_par */ internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, snum - insert_num, 0); - /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0); */ - /* insert insert_num keys and node-pointers into S_new */ + /* + * insert insert_num keys and node-pointers + * into S_new + */ internal_insert_childs(&dest_bi, /*S_new,tb->S[h-1]->b_next, */ child_pos - n - insert_num + @@ -1033,7 +1102,6 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, n - child_pos + 1, 1); - /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1); */ /* calculate number of new items that fall into S_new */ k = snum - n + child_pos - 1; @@ -1043,7 +1111,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure /* new_insert_key = insert_key[insert_num - k - 1] */ memcpy(&new_insert_key, insert_key + insert_num - k - 1, KEY_SIZE); - /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */ + /* + * replace first node-ptr in S_new by node-ptr + * to insert_ptr[insert_num-k-1] + */ dc = B_N_CHILD(S_new, 0); put_dc_size(dc, @@ -1066,7 +1137,7 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure || buffer_dirty(S_new), "cm-00001: bad S_new (%b)", S_new); - // S_new is released in unfix_nodes + /* S_new is released in unfix_nodes */ } n = B_NR_ITEMS(tbSh); /*number of items in S[h] */ diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ad62bdbb451..63b2b0ec49e 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -25,7 +25,10 @@ int reiserfs_commit_write(struct file *f, struct page *page, void reiserfs_evict_inode(struct inode *inode) { - /* We need blocks for transaction + (user+group) quota update (possibly delete) */ + /* + * We need blocks for transaction + (user+group) quota + * update (possibly delete) + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); @@ -35,12 +38,16 @@ void reiserfs_evict_inode(struct inode *inode) if (!inode->i_nlink && !is_bad_inode(inode)) dquot_initialize(inode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (inode->i_nlink) goto no_delete; - /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ - if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ + /* + * The = 0 happens when we abort creating a new inode + * for some reason like lack of space.. + * also handles bad_inode case + */ + if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { reiserfs_delete_xattrs(inode); @@ -54,34 +61,43 @@ void reiserfs_evict_inode(struct inode *inode) err = reiserfs_delete_object(&th, inode); - /* Do quota update inside a transaction for journaled quotas. We must do that - * after delete_object so that quota updates go into the same transaction as - * stat data deletion */ + /* + * Do quota update inside a transaction for journaled quotas. + * We must do that after delete_object so that quota updates + * go into the same transaction as stat data deletion + */ if (!err) { int depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_free_inode(inode); reiserfs_write_lock_nested(inode->i_sb, depth); } - if (journal_end(&th, inode->i_sb, jbegin_count)) + if (journal_end(&th)) goto out; - /* check return value from reiserfs_delete_object after + /* + * check return value from reiserfs_delete_object after * ending the transaction */ if (err) goto out; - /* all items of file are deleted, so we can remove "save" link */ - remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything - * about an error here */ + /* + * all items of file are deleted, so we can remove + * "save" link + * we can't do anything about an error here + */ + remove_save_link(inode, 0 /* not truncate */); out: reiserfs_write_unlock(inode->i_sb); } else { /* no object items are in the tree */ ; } - clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ + + /* note this must go after the journal_end to prevent deadlock */ + clear_inode(inode); + dquot_drop(inode); inode->i_blocks = 0; return; @@ -103,8 +119,10 @@ static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, key->key_length = length; } -/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set - offset and type of key */ +/* + * take base of inode_key (it comes from inode always) (dirid, objectid) + * and version from an inode, set offset and type of key + */ void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, int type, int length) { @@ -114,9 +132,7 @@ void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, length); } -// -// when key is 0, do not set version and short key -// +/* when key is 0, do not set version and short key */ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, int version, loff_t offset, int type, int length, @@ -132,43 +148,47 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, set_le_ih_k_type(ih, type); put_ih_item_len(ih, length); /* set_ih_free_space (ih, 0); */ - // for directory items it is entry count, for directs and stat - // datas - 0xffff, for indirects - 0 + /* + * for directory items it is entry count, for directs and stat + * datas - 0xffff, for indirects - 0 + */ put_ih_entry_count(ih, entry_count); } -// -// FIXME: we might cache recently accessed indirect item - -// Ugh. Not too eager for that.... -// I cut the code until such time as I see a convincing argument (benchmark). -// I don't want a bloated inode struct..., and I don't like code complexity.... - -/* cutting the code is fine, since it really isn't in use yet and is easy -** to add back in. But, Vladimir has a really good idea here. Think -** about what happens for reading a file. For each page, -** The VFS layer calls reiserfs_readpage, who searches the tree to find -** an indirect item. This indirect item has X number of pointers, where -** X is a big number if we've done the block allocation right. But, -** we only use one or two of these pointers during each call to readpage, -** needlessly researching again later on. -** -** The size of the cache could be dynamic based on the size of the file. -** -** I'd also like to see us cache the location the stat data item, since -** we are needlessly researching for that frequently. -** -** --chris -*/ +/* + * FIXME: we might cache recently accessed indirect item + * Ugh. Not too eager for that.... + * I cut the code until such time as I see a convincing argument (benchmark). + * I don't want a bloated inode struct..., and I don't like code complexity.... + */ -/* If this page has a file tail in it, and -** it was read in by get_block_create_0, the page data is valid, -** but tail is still sitting in a direct item, and we can't write to -** it. So, look through this page, and check all the mapped buffers -** to make sure they have valid block numbers. Any that don't need -** to be unmapped, so that __block_write_begin will correctly call -** reiserfs_get_block to convert the tail into an unformatted node -*/ +/* + * cutting the code is fine, since it really isn't in use yet and is easy + * to add back in. But, Vladimir has a really good idea here. Think + * about what happens for reading a file. For each page, + * The VFS layer calls reiserfs_readpage, who searches the tree to find + * an indirect item. This indirect item has X number of pointers, where + * X is a big number if we've done the block allocation right. But, + * we only use one or two of these pointers during each call to readpage, + * needlessly researching again later on. + * + * The size of the cache could be dynamic based on the size of the file. + * + * I'd also like to see us cache the location the stat data item, since + * we are needlessly researching for that frequently. + * + * --chris + */ + +/* + * If this page has a file tail in it, and + * it was read in by get_block_create_0, the page data is valid, + * but tail is still sitting in a direct item, and we can't write to + * it. So, look through this page, and check all the mapped buffers + * to make sure they have valid block numbers. Any that don't need + * to be unmapped, so that __block_write_begin will correctly call + * reiserfs_get_block to convert the tail into an unformatted node + */ static inline void fix_tail_page_for_writing(struct page *page) { struct buffer_head *head, *next, *bh; @@ -186,8 +206,10 @@ static inline void fix_tail_page_for_writing(struct page *page) } } -/* reiserfs_get_block does not need to allocate a block only if it has been - done already or non-hole position has been found in the indirect item */ +/* + * reiserfs_get_block does not need to allocate a block only if it has been + * done already or non-hole position has been found in the indirect item + */ static inline int allocation_needed(int retval, b_blocknr_t allocated, struct item_head *ih, __le32 * item, int pos_in_item) @@ -211,14 +233,16 @@ static inline void set_block_dev_mapped(struct buffer_head *bh, map_bh(bh, inode->i_sb, block); } -// -// files which were created in the earlier version can not be longer, -// than 2 gb -// +/* + * files which were created in the earlier version can not be longer, + * than 2 gb + */ static int file_capable(struct inode *inode, sector_t block) { - if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is new file. - block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb + /* it is new file. */ + if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || + /* old file, but 'block' is inside of 2gb */ + block < (1 << (31 - inode->i_sb->s_blocksize_bits))) return 1; return 0; @@ -228,7 +252,6 @@ static int restart_transaction(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path) { struct super_block *s = th->t_super; - int len = th->t_blocks_allocated; int err; BUG_ON(!th->t_trans_id); @@ -241,7 +264,7 @@ static int restart_transaction(struct reiserfs_transaction_handle *th, return 0; } reiserfs_update_sd(th, inode); - err = journal_end(th, s, len); + err = journal_end(th); if (!err) { err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); if (!err) @@ -250,14 +273,14 @@ static int restart_transaction(struct reiserfs_transaction_handle *th, return err; } -// it is called by get_block when create == 0. Returns block number -// for 'block'-th logical block of file. When it hits direct item it -// returns 0 (being called from bmap) or read direct item into piece -// of page (bh_result) - -// Please improve the english/clarity in the comment above, as it is -// hard to understand. - +/* + * it is called by get_block when create == 0. Returns block number + * for 'block'-th logical block of file. When it hits direct item it + * returns 0 (being called from bmap) or read direct item into piece + * of page (bh_result) + * Please improve the english/clarity in the comment above, as it is + * hard to understand. + */ static int _get_block_create_0(struct inode *inode, sector_t block, struct buffer_head *bh_result, int args) { @@ -273,7 +296,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, int done = 0; unsigned long offset; - // prepare the key to look for the 'block'-th block of file + /* prepare the key to look for the 'block'-th block of file */ make_cpu_key(&key, inode, (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3); @@ -285,23 +308,28 @@ static int _get_block_create_0(struct inode *inode, sector_t block, kunmap(bh_result->b_page); if (result == IO_ERROR) return -EIO; - // We do not return -ENOENT if there is a hole but page is uptodate, because it means - // That there is some MMAPED data associated with it that is yet to be written to disk. + /* + * We do not return -ENOENT if there is a hole but page is + * uptodate, because it means that there is some MMAPED data + * associated with it that is yet to be written to disk. + */ if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page)) { return -ENOENT; } return 0; } - // + bh = get_last_bh(&path); - ih = get_ih(&path); + ih = tp_item_head(&path); if (is_indirect_le_ih(ih)) { - __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih); + __le32 *ind_item = (__le32 *) ih_item_body(bh, ih); - /* FIXME: here we could cache indirect item or part of it in - the inode to avoid search_by_key in case of subsequent - access to file */ + /* + * FIXME: here we could cache indirect item or part of it in + * the inode to avoid search_by_key in case of subsequent + * access to file + */ blocknr = get_block_num(ind_item, path.pos_in_item); ret = 0; if (blocknr) { @@ -311,8 +339,12 @@ static int _get_block_create_0(struct inode *inode, sector_t block, set_buffer_boundary(bh_result); } } else - // We do not return -ENOENT if there is a hole but page is uptodate, because it means - // That there is some MMAPED data associated with it that is yet to be written to disk. + /* + * We do not return -ENOENT if there is a hole but + * page is uptodate, because it means that there is + * some MMAPED data associated with it that is + * yet to be written to disk. + */ if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page)) { ret = -ENOENT; @@ -323,41 +355,45 @@ static int _get_block_create_0(struct inode *inode, sector_t block, kunmap(bh_result->b_page); return ret; } - // requested data are in direct item(s) + /* requested data are in direct item(s) */ if (!(args & GET_BLOCK_READ_DIRECT)) { - // we are called by bmap. FIXME: we can not map block of file - // when it is stored in direct item(s) + /* + * we are called by bmap. FIXME: we can not map block of file + * when it is stored in direct item(s) + */ pathrelse(&path); if (p) kunmap(bh_result->b_page); return -ENOENT; } - /* if we've got a direct item, and the buffer or page was uptodate, - ** we don't want to pull data off disk again. skip to the - ** end, where we map the buffer and return + /* + * if we've got a direct item, and the buffer or page was uptodate, + * we don't want to pull data off disk again. skip to the + * end, where we map the buffer and return */ if (buffer_uptodate(bh_result)) { goto finished; } else /* - ** grab_tail_page can trigger calls to reiserfs_get_block on up to date - ** pages without any buffers. If the page is up to date, we don't want - ** read old data off disk. Set the up to date bit on the buffer instead - ** and jump to the end + * grab_tail_page can trigger calls to reiserfs_get_block on + * up to date pages without any buffers. If the page is up + * to date, we don't want read old data off disk. Set the up + * to date bit on the buffer instead and jump to the end */ if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { set_buffer_uptodate(bh_result); goto finished; } - // read file tail into part of page + /* read file tail into part of page */ offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); copy_item_head(&tmp_ih, ih); - /* we only want to kmap if we are reading the tail into the page. - ** this is not the common case, so we don't kmap until we are - ** sure we need to. But, this means the item might move if - ** kmap schedules + /* + * we only want to kmap if we are reading the tail into the page. + * this is not the common case, so we don't kmap until we are + * sure we need to. But, this means the item might move if + * kmap schedules */ if (!p) p = (char *)kmap(bh_result->b_page); @@ -368,10 +404,11 @@ static int _get_block_create_0(struct inode *inode, sector_t block, if (!is_direct_le_ih(ih)) { BUG(); } - /* make sure we don't read more bytes than actually exist in - ** the file. This can happen in odd cases where i_size isn't - ** correct, and when direct item padding results in a few - ** extra bytes at the end of the direct item + /* + * make sure we don't read more bytes than actually exist in + * the file. This can happen in odd cases where i_size isn't + * correct, and when direct item padding results in a few + * extra bytes at the end of the direct item */ if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) break; @@ -383,40 +420,43 @@ static int _get_block_create_0(struct inode *inode, sector_t block, } else { chars = ih_item_len(ih) - path.pos_in_item; } - memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars); + memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars); if (done) break; p += chars; + /* + * we done, if read direct item is not the last item of + * node FIXME: we could try to check right delimiting key + * to see whether direct item continues in the right + * neighbor or rely on i_size + */ if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) - // we done, if read direct item is not the last item of - // node FIXME: we could try to check right delimiting key - // to see whether direct item continues in the right - // neighbor or rely on i_size break; - // update key to look for the next piece + /* update key to look for the next piece */ set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); result = search_for_position_by_key(inode->i_sb, &key, &path); if (result != POSITION_FOUND) - // i/o error most likely + /* i/o error most likely */ break; bh = get_last_bh(&path); - ih = get_ih(&path); + ih = tp_item_head(&path); } while (1); flush_dcache_page(bh_result->b_page); kunmap(bh_result->b_page); - finished: +finished: pathrelse(&path); if (result == IO_ERROR) return -EIO; - /* this buffer has valid data, but isn't valid for io. mapping it to + /* + * this buffer has valid data, but isn't valid for io. mapping it to * block #0 tells the rest of reiserfs it just has a tail in it */ map_bh(bh_result, inode->i_sb, 0); @@ -424,8 +464,10 @@ static int _get_block_create_0(struct inode *inode, sector_t block, return 0; } -// this is called to create file map. So, _get_block_create_0 will not -// read direct item +/* + * this is called to create file map. So, _get_block_create_0 will not + * read direct item + */ static int reiserfs_bmap(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { @@ -439,22 +481,23 @@ static int reiserfs_bmap(struct inode *inode, sector_t block, return 0; } -/* special version of get_block that is only used by grab_tail_page right -** now. It is sent to __block_write_begin, and when you try to get a -** block past the end of the file (or a block from a hole) it returns -** -ENOENT instead of a valid buffer. __block_write_begin expects to -** be able to do i/o on the buffers returned, unless an error value -** is also returned. -** -** So, this allows __block_write_begin to be used for reading a single block -** in a page. Where it does not produce a valid page for holes, or past the -** end of the file. This turns out to be exactly what we need for reading -** tails for conversion. -** -** The point of the wrapper is forcing a certain value for create, even -** though the VFS layer is calling this function with create==1. If you -** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, -** don't use this function. +/* + * special version of get_block that is only used by grab_tail_page right + * now. It is sent to __block_write_begin, and when you try to get a + * block past the end of the file (or a block from a hole) it returns + * -ENOENT instead of a valid buffer. __block_write_begin expects to + * be able to do i/o on the buffers returned, unless an error value + * is also returned. + * + * So, this allows __block_write_begin to be used for reading a single block + * in a page. Where it does not produce a valid page for holes, or past the + * end of the file. This turns out to be exactly what we need for reading + * tails for conversion. + * + * The point of the wrapper is forcing a certain value for create, even + * though the VFS layer is calling this function with create==1. If you + * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, + * don't use this function. */ static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, struct buffer_head *bh_result, @@ -463,8 +506,10 @@ static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE); } -/* This is special helper for reiserfs_get_block in case we are executing - direct_IO request. */ +/* + * This is special helper for reiserfs_get_block in case we are executing + * direct_IO request. + */ static int reiserfs_get_blocks_direct_io(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, @@ -474,9 +519,11 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, bh_result->b_page = NULL; - /* We set the b_size before reiserfs_get_block call since it is - referenced in convert_tail_for_hole() that may be called from - reiserfs_get_block() */ + /* + * We set the b_size before reiserfs_get_block call since it is + * referenced in convert_tail_for_hole() that may be called from + * reiserfs_get_block() + */ bh_result->b_size = (1 << inode->i_blkbits); ret = reiserfs_get_block(inode, iblock, bh_result, @@ -486,14 +533,18 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, /* don't allow direct io onto tail pages */ if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { - /* make sure future calls to the direct io funcs for this offset - ** in the file fail by unmapping the buffer + /* + * make sure future calls to the direct io funcs for this + * offset in the file fail by unmapping the buffer */ clear_buffer_mapped(bh_result); ret = -EINVAL; } - /* Possible unpacked tail. Flush the data before pages have - disappeared */ + + /* + * Possible unpacked tail. Flush the data before pages have + * disappeared + */ if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { int err; @@ -507,20 +558,20 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, if (err < 0) ret = err; } - out: +out: return ret; } /* -** helper function for when reiserfs_get_block is called for a hole -** but the file tail is still in a direct item -** bh_result is the buffer head for the hole -** tail_offset is the offset of the start of the tail in the file -** -** This calls prepare_write, which will start a new transaction -** you should not be in a transaction, or have any paths held when you -** call this. -*/ + * helper function for when reiserfs_get_block is called for a hole + * but the file tail is still in a direct item + * bh_result is the buffer head for the hole + * tail_offset is the offset of the start of the tail in the file + * + * This calls prepare_write, which will start a new transaction + * you should not be in a transaction, or have any paths held when you + * call this. + */ static int convert_tail_for_hole(struct inode *inode, struct buffer_head *bh_result, loff_t tail_offset) @@ -540,9 +591,10 @@ static int convert_tail_for_hole(struct inode *inode, tail_end = (tail_start | (bh_result->b_size - 1)) + 1; index = tail_offset >> PAGE_CACHE_SHIFT; - /* hole_page can be zero in case of direct_io, we are sure - that we cannot get here if we write with O_DIRECT into - tail page */ + /* + * hole_page can be zero in case of direct_io, we are sure + * that we cannot get here if we write with O_DIRECT into tail page + */ if (!hole_page || index != hole_page->index) { tail_page = grab_cache_page(inode->i_mapping, index); retval = -ENOMEM; @@ -553,14 +605,15 @@ static int convert_tail_for_hole(struct inode *inode, tail_page = hole_page; } - /* we don't have to make sure the conversion did not happen while - ** we were locking the page because anyone that could convert - ** must first take i_mutex. - ** - ** We must fix the tail page for writing because it might have buffers - ** that are mapped, but have a block number of 0. This indicates tail - ** data that has been read directly into the page, and - ** __block_write_begin won't trigger a get_block in this case. + /* + * we don't have to make sure the conversion did not happen while + * we were locking the page because anyone that could convert + * must first take i_mutex. + * + * We must fix the tail page for writing because it might have buffers + * that are mapped, but have a block number of 0. This indicates tail + * data that has been read directly into the page, and + * __block_write_begin won't trigger a get_block in this case. */ fix_tail_page_for_writing(tail_page); retval = __reiserfs_write_begin(tail_page, tail_start, @@ -573,12 +626,12 @@ static int convert_tail_for_hole(struct inode *inode, retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end); - unlock: +unlock: if (tail_page != hole_page) { unlock_page(tail_page); page_cache_release(tail_page); } - out: +out: return retval; } @@ -604,7 +657,8 @@ int reiserfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { int repeat, retval = 0; - b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int + /* b_blocknr_t is (unsigned) 32 bit int*/ + b_blocknr_t allocated_block_nr = 0; INITIALIZE_PATH(path); int pos_in_item; struct cpu_key key; @@ -614,12 +668,14 @@ int reiserfs_get_block(struct inode *inode, sector_t block, int done; int fs_gen; struct reiserfs_transaction_handle *th = NULL; - /* space reserved in transaction batch: - . 3 balancings in direct->indirect conversion - . 1 block involved into reiserfs_update_sd() - XXX in practically impossible worst case direct2indirect() - can incur (much) more than 3 balancings. - quota update for user, group */ + /* + * space reserved in transaction batch: + * . 3 balancings in direct->indirect conversion + * . 1 block involved into reiserfs_update_sd() + * XXX in practically impossible worst case direct2indirect() + * can incur (much) more than 3 balancings. + * quota update for user, group + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); @@ -636,8 +692,9 @@ int reiserfs_get_block(struct inode *inode, sector_t block, return -EFBIG; } - /* if !create, we aren't changing the FS, so we don't need to - ** log anything, so we don't need to start a transaction + /* + * if !create, we aren't changing the FS, so we don't need to + * log anything, so we don't need to start a transaction */ if (!(create & GET_BLOCK_CREATE)) { int ret; @@ -647,6 +704,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, reiserfs_write_unlock(inode->i_sb); return ret; } + /* * if we're already in a transaction, make sure to close * any new transactions we start in this func @@ -655,8 +713,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block, reiserfs_transaction_running(inode->i_sb)) dangle = 0; - /* If file is of such a size, that it might have a tail and tails are enabled - ** we should mark it as possibly needing tail packing on close + /* + * If file is of such a size, that it might have a tail and + * tails are enabled we should mark it as possibly needing + * tail packing on close */ if ((have_large_tails(inode->i_sb) && inode->i_size < i_block_size(inode) * 4) @@ -667,7 +727,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, /* set the key of the first byte in the 'block'-th block of file */ make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { - start_trans: +start_trans: th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); if (!th) { retval = -ENOMEM; @@ -675,7 +735,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, } reiserfs_update_inode_transaction(inode); } - research: +research: retval = search_for_position_by_key(inode->i_sb, &key, &path); if (retval == IO_ERROR) { @@ -684,8 +744,8 @@ int reiserfs_get_block(struct inode *inode, sector_t block, } bh = get_last_bh(&path); - ih = get_ih(&path); - item = get_item(&path); + ih = tp_item_head(&path); + item = tp_item_body(&path); pos_in_item = path.pos_in_item; fs_gen = get_generation(inode->i_sb); @@ -703,11 +763,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block, _allocate_block(th, block, inode, &allocated_block_nr, &path, create); + /* + * restart the transaction to give the journal a chance to free + * some blocks. releases the path, so we have to go back to + * research if we succeed on the second try + */ if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { - /* restart the transaction to give the journal a chance to free - ** some blocks. releases the path, so we have to go back to - ** research if we succeed on the second try - */ SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; retval = restart_transaction(th, inode, &path); if (retval) @@ -734,9 +795,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (indirect_item_found(retval, ih)) { b_blocknr_t unfm_ptr; - /* 'block'-th block is in the file already (there is - corresponding cell in some indirect item). But it may be - zero unformatted node pointer (hole) */ + /* + * 'block'-th block is in the file already (there is + * corresponding cell in some indirect item). But it may be + * zero unformatted node pointer (hole) + */ unfm_ptr = get_block_num(item, pos_in_item); if (unfm_ptr == 0) { /* use allocated block to plug the hole */ @@ -753,7 +816,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, reiserfs_add_ordered_list(inode, bh_result); put_block_num(item, pos_in_item, allocated_block_nr); unfm_ptr = allocated_block_nr; - journal_mark_dirty(th, inode->i_sb, bh); + journal_mark_dirty(th, bh); reiserfs_update_sd(th, inode); } set_block_dev_mapped(bh_result, unfm_ptr, inode); @@ -764,9 +827,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block, reiserfs_write_unlock(inode->i_sb); - /* the item was found, so new blocks were not added to the file - ** there is no need to make sure the inode is updated with this - ** transaction + /* + * the item was found, so new blocks were not added to the file + * there is no need to make sure the inode is updated with this + * transaction */ return retval; } @@ -776,9 +840,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block, goto start_trans; } - /* desired position is not found or is in the direct item. We have - to append file with holes up to 'block'-th block converting - direct items to indirect one if necessary */ + /* + * desired position is not found or is in the direct item. We have + * to append file with holes up to 'block'-th block converting + * direct items to indirect one if necessary + */ done = 0; do { if (is_statdata_le_ih(ih)) { @@ -790,16 +856,18 @@ int reiserfs_get_block(struct inode *inode, sector_t block, TYPE_INDIRECT, UNFM_P_SIZE, 0 /* free_space */ ); + /* + * we are going to add 'block'-th block to the file. + * Use allocated block for that + */ if (cpu_key_k_offset(&key) == 1) { - /* we are going to add 'block'-th block to the file. Use - allocated block for that */ unp = cpu_to_le32(allocated_block_nr); set_block_dev_mapped(bh_result, allocated_block_nr, inode); set_buffer_new(bh_result); done = 1; } - tmp_key = key; // ;) + tmp_key = key; /* ;) */ set_cpu_key_k_offset(&tmp_key, 1); PATH_LAST_POSITION(&path)++; @@ -809,9 +877,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (retval) { reiserfs_free_block(th, inode, allocated_block_nr, 1); - goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST + /* + * retval == -ENOSPC, -EDQUOT or -EIO + * or -EEXIST + */ + goto failure; } - //mark_tail_converted (inode); } else if (is_direct_le_ih(ih)) { /* direct item has to be converted */ loff_t tail_offset; @@ -819,18 +890,24 @@ int reiserfs_get_block(struct inode *inode, sector_t block, tail_offset = ((le_ih_k_offset(ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; + + /* + * direct item we just found fits into block we have + * to map. Convert it into unformatted node: use + * bh_result for the conversion + */ if (tail_offset == cpu_key_k_offset(&key)) { - /* direct item we just found fits into block we have - to map. Convert it into unformatted node: use - bh_result for the conversion */ set_block_dev_mapped(bh_result, allocated_block_nr, inode); unbh = bh_result; done = 1; } else { - /* we have to padd file tail stored in direct item(s) - up to block size and convert it to unformatted - node. FIXME: this should also get into page cache */ + /* + * we have to pad file tail stored in direct + * item(s) up to block size and convert it + * to unformatted node. FIXME: this should + * also get into page cache + */ pathrelse(&path); /* @@ -859,7 +936,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block, inode->i_ino, retval); if (allocated_block_nr) { - /* the bitmap, the super, and the stat data == 3 */ + /* + * the bitmap, the super, + * and the stat data == 3 + */ if (!th) th = reiserfs_persistent_transaction(inode->i_sb, 3); if (th) @@ -881,43 +961,57 @@ int reiserfs_get_block(struct inode *inode, sector_t block, allocated_block_nr, 1); goto failure; } - /* it is important the set_buffer_uptodate is done after - ** the direct2indirect. The buffer might contain valid - ** data newer than the data on disk (read by readpage, changed, - ** and then sent here by writepage). direct2indirect needs - ** to know if unbh was already up to date, so it can decide - ** if the data in unbh needs to be replaced with data from - ** the disk + /* + * it is important the set_buffer_uptodate is done + * after the direct2indirect. The buffer might + * contain valid data newer than the data on disk + * (read by readpage, changed, and then sent here by + * writepage). direct2indirect needs to know if unbh + * was already up to date, so it can decide if the + * data in unbh needs to be replaced with data from + * the disk */ set_buffer_uptodate(unbh); - /* unbh->b_page == NULL in case of DIRECT_IO request, this means - buffer will disappear shortly, so it should not be added to + /* + * unbh->b_page == NULL in case of DIRECT_IO request, + * this means buffer will disappear shortly, so it + * should not be added to */ if (unbh->b_page) { - /* we've converted the tail, so we must - ** flush unbh before the transaction commits + /* + * we've converted the tail, so we must + * flush unbh before the transaction commits */ reiserfs_add_tail_list(inode, unbh); - /* mark it dirty now to prevent commit_write from adding - ** this buffer to the inode's dirty buffer list + /* + * mark it dirty now to prevent commit_write + * from adding this buffer to the inode's + * dirty buffer list */ /* - * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). - * It's still atomic, but it sets the page dirty too, - * which makes it eligible for writeback at any time by the - * VM (which was also the case with __mark_buffer_dirty()) + * AKPM: changed __mark_buffer_dirty to + * mark_buffer_dirty(). It's still atomic, + * but it sets the page dirty too, which makes + * it eligible for writeback at any time by the + * VM (which was also the case with + * __mark_buffer_dirty()) */ mark_buffer_dirty(unbh); } } else { - /* append indirect item with holes if needed, when appending - pointer to 'block'-th block use block, which is already - allocated */ + /* + * append indirect item with holes if needed, when + * appending pointer to 'block'-th block use block, + * which is already allocated + */ struct cpu_key tmp_key; - unp_t unf_single = 0; // We use this in case we need to allocate only - // one block which is a fastpath + /* + * We use this in case we need to allocate + * only one block which is a fastpath + */ + unp_t unf_single = 0; unp_t *un; __u64 max_to_insert = MAX_ITEM_LEN(inode->i_sb->s_blocksize) / @@ -926,14 +1020,17 @@ int reiserfs_get_block(struct inode *inode, sector_t block, RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, "vs-804: invalid position for append"); - /* indirect item has to be appended, set up key of that position */ + /* + * indirect item has to be appended, + * set up key of that position + * (key type is unimportant) + */ make_cpu_key(&tmp_key, inode, le_key_k_offset(version, - &(ih->ih_key)) + + &ih->ih_key) + op_bytes_number(ih, inode->i_sb->s_blocksize), - //pos_in_item * inode->i_sb->s_blocksize, - TYPE_INDIRECT, 3); // key type is unimportant + TYPE_INDIRECT, 3); RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key), "green-805: invalid offset"); @@ -954,8 +1051,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block, } } if (blocks_needed <= max_to_insert) { - /* we are going to add target block to the file. Use allocated - block for that */ + /* + * we are going to add target block to + * the file. Use allocated block for that + */ un[blocks_needed - 1] = cpu_to_le32(allocated_block_nr); set_block_dev_mapped(bh_result, @@ -964,8 +1063,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block, done = 1; } else { /* paste hole to the indirect item */ - /* If kmalloc failed, max_to_insert becomes zero and it means we - only have space for one block */ + /* + * If kmalloc failed, max_to_insert becomes + * zero and it means we only have space for + * one block + */ blocks_needed = max_to_insert ? max_to_insert : 1; } @@ -984,9 +1086,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block, goto failure; } if (!done) { - /* We need to mark new file size in case this function will be - interrupted/aborted later on. And we may do this only for - holes. */ + /* + * We need to mark new file size in case + * this function will be interrupted/aborted + * later on. And we may do this only for + * holes. + */ inode->i_size += inode->i_sb->s_blocksize * blocks_needed; } @@ -995,13 +1100,13 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (done == 1) break; - /* this loop could log more blocks than we had originally asked - ** for. So, we have to allow the transaction to end if it is - ** too big or too full. Update the inode so things are - ** consistent if we crash before the function returns - ** - ** release the path so that anybody waiting on the path before - ** ending their transaction will be able to continue. + /* + * this loop could log more blocks than we had originally + * asked for. So, we have to allow the transaction to end + * if it is too big or too full. Update the inode so things + * are consistent if we crash before the function returns + * release the path so that anybody waiting on the path before + * ending their transaction will be able to continue. */ if (journal_transaction_should_end(th, th->t_blocks_allocated)) { retval = restart_transaction(th, inode, &path); @@ -1031,14 +1136,14 @@ int reiserfs_get_block(struct inode *inode, sector_t block, goto failure; } bh = get_last_bh(&path); - ih = get_ih(&path); - item = get_item(&path); + ih = tp_item_head(&path); + item = tp_item_body(&path); pos_in_item = path.pos_in_item; } while (1); retval = 0; - failure: +failure: if (th && (!dangle || (retval && !th->t_trans_id))) { int err; if (th->t_trans_id) @@ -1060,8 +1165,10 @@ reiserfs_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); } -/* Compute real number of used bytes by file - * Following three functions can go away when we'll have enough space in stat item +/* + * Compute real number of used bytes by file + * Following three functions can go away when we'll have enough space in + * stat item */ static int real_space_diff(struct inode *inode, int sd_size) { @@ -1071,13 +1178,14 @@ static int real_space_diff(struct inode *inode, int sd_size) if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) return sd_size; - /* End of file is also in full block with indirect reference, so round - ** up to the next block. - ** - ** there is just no way to know if the tail is actually packed - ** on the file, so we have to assume it isn't. When we pack the - ** tail, we add 4 bytes to pretend there really is an unformatted - ** node pointer + /* + * End of file is also in full block with indirect reference, so round + * up to the next block. + * + * there is just no way to know if the tail is actually packed + * on the file, so we have to assume it isn't. When we pack the + * tail, we add 4 bytes to pretend there really is an unformatted + * node pointer */ bytes = ((inode->i_size + @@ -1108,36 +1216,36 @@ static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) bytes += (loff_t) 511; } - /* files from before the quota patch might i_blocks such that - ** bytes < real_space. Deal with that here to prevent it from - ** going negative. + /* + * files from before the quota patch might i_blocks such that + * bytes < real_space. Deal with that here to prevent it from + * going negative. */ if (bytes < real_space) return 0; return (bytes - real_space) >> 9; } -// -// BAD: new directories have stat data of new type and all other items -// of old type. Version stored in the inode says about body items, so -// in update_stat_data we can not rely on inode, but have to check -// item version directly -// +/* + * BAD: new directories have stat data of new type and all other items + * of old type. Version stored in the inode says about body items, so + * in update_stat_data we can not rely on inode, but have to check + * item version directly + */ -// called by read_locked_inode +/* called by read_locked_inode */ static void init_inode(struct inode *inode, struct treepath *path) { struct buffer_head *bh; struct item_head *ih; __u32 rdev; - //int version = ITEM_VERSION_1; bh = PATH_PLAST_BUFFER(path); - ih = PATH_PITEM_HEAD(path); + ih = tp_item_head(path); - copy_key(INODE_PKEY(inode), &(ih->ih_key)); + copy_key(INODE_PKEY(inode), &ih->ih_key); - INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list); REISERFS_I(inode)->i_flags = 0; REISERFS_I(inode)->i_prealloc_block = 0; REISERFS_I(inode)->i_prealloc_count = 0; @@ -1147,7 +1255,7 @@ static void init_inode(struct inode *inode, struct treepath *path) if (stat_data_v1(ih)) { struct stat_data_v1 *sd = - (struct stat_data_v1 *)B_I_PITEM(bh, ih); + (struct stat_data_v1 *)ih_item_body(bh, ih); unsigned long blocks; set_inode_item_key_version(inode, KEY_FORMAT_3_5); @@ -1168,20 +1276,26 @@ static void init_inode(struct inode *inode, struct treepath *path) inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); blocks = (inode->i_size + 511) >> 9; blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9); + + /* + * there was a bug in <=3.5.23 when i_blocks could take + * negative values. Starting from 3.5.17 this value could + * even be stored in stat data. For such files we set + * i_blocks based on file size. Just 2 notes: this can be + * wrong for sparse files. On-disk value will be only + * updated if file's inode will ever change + */ if (inode->i_blocks > blocks) { - // there was a bug in <=3.5.23 when i_blocks could take negative - // values. Starting from 3.5.17 this value could even be stored in - // stat data. For such files we set i_blocks based on file - // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be - // only updated if file's inode will ever change inode->i_blocks = blocks; } rdev = sd_v1_rdev(sd); REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd); - /* an early bug in the quota code can give us an odd number for the - ** block count. This is incorrect, fix it here. + + /* + * an early bug in the quota code can give us an odd + * number for the block count. This is incorrect, fix it here. */ if (inode->i_blocks & 1) { inode->i_blocks++; @@ -1189,13 +1303,17 @@ static void init_inode(struct inode *inode, struct treepath *path) inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, SD_V1_SIZE)); - /* nopack is initially zero for v1 objects. For v2 objects, - nopack is initialised from sd_attrs */ + /* + * nopack is initially zero for v1 objects. For v2 objects, + * nopack is initialised from sd_attrs + */ REISERFS_I(inode)->i_flags &= ~i_nopack_mask; } else { - // new stat data found, but object may have old items - // (directories and symlinks) - struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); + /* + * new stat data found, but object may have old items + * (directories and symlinks) + */ + struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih); inode->i_mode = sd_v2_mode(sd); set_nlink(inode, sd_v2_nlink(sd)); @@ -1225,8 +1343,10 @@ static void init_inode(struct inode *inode, struct treepath *path) inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, SD_V2_SIZE)); - /* read persistent inode attributes from sd and initialise - generic inode flags from them */ + /* + * read persistent inode attributes from sd and initialise + * generic inode flags from them + */ REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); } @@ -1249,7 +1369,7 @@ static void init_inode(struct inode *inode, struct treepath *path) } } -// update new stat data with inode fields +/* update new stat data with inode fields */ static void inode2sd(void *sd, struct inode *inode, loff_t size) { struct stat_data *sd_v2 = (struct stat_data *)sd; @@ -1273,7 +1393,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size) set_sd_v2_attrs(sd_v2, flags); } -// used to copy inode's fields to old stat data +/* used to copy inode's fields to old stat data */ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) { struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; @@ -1292,14 +1412,15 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) else set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); - // Sigh. i_first_direct_byte is back + /* Sigh. i_first_direct_byte is back */ set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte); } -/* NOTE, you must prepare the buffer head before sending it here, -** and then log it after the call -*/ +/* + * NOTE, you must prepare the buffer head before sending it here, + * and then log it after the call + */ static void update_stat_data(struct treepath *path, struct inode *inode, loff_t size) { @@ -1307,17 +1428,17 @@ static void update_stat_data(struct treepath *path, struct inode *inode, struct item_head *ih; bh = PATH_PLAST_BUFFER(path); - ih = PATH_PITEM_HEAD(path); + ih = tp_item_head(path); if (!is_statdata_le_ih(ih)) reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h", INODE_PKEY(inode), ih); + /* path points to old stat data */ if (stat_data_v1(ih)) { - // path points to old stat data - inode2sd_v1(B_I_PITEM(bh, ih), inode, size); + inode2sd_v1(ih_item_body(bh, ih), inode, size); } else { - inode2sd(B_I_PITEM(bh, ih), inode, size); + inode2sd(ih_item_body(bh, ih), inode, size); } return; @@ -1335,7 +1456,8 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); //key type is unimportant + /* key type is unimportant */ + make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); for (;;) { int pos; @@ -1363,45 +1485,48 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, return; } - /* sigh, prepare_for_journal might schedule. When it schedules the - ** FS might change. We have to detect that, and loop back to the - ** search if the stat data item has moved + /* + * sigh, prepare_for_journal might schedule. When it + * schedules the FS might change. We have to detect that, + * and loop back to the search if the stat data item has moved */ bh = get_last_bh(&path); - ih = get_ih(&path); + ih = tp_item_head(&path); copy_item_head(&tmp_ih, ih); fs_gen = get_generation(inode->i_sb); reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + + /* Stat_data item has been moved after scheduling. */ if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) { reiserfs_restore_prepared_buffer(inode->i_sb, bh); - continue; /* Stat_data item has been moved after scheduling. */ + continue; } break; } update_stat_data(&path, inode, size); - journal_mark_dirty(th, th->t_super, bh); + journal_mark_dirty(th, bh); pathrelse(&path); return; } -/* reiserfs_read_locked_inode is called to read the inode off disk, and it -** does a make_bad_inode when things go wrong. But, we need to make sure -** and clear the key in the private portion of the inode, otherwise a -** corresponding iput might try to delete whatever object the inode last -** represented. -*/ +/* + * reiserfs_read_locked_inode is called to read the inode off disk, and it + * does a make_bad_inode when things go wrong. But, we need to make sure + * and clear the key in the private portion of the inode, otherwise a + * corresponding iput might try to delete whatever object the inode last + * represented. + */ static void reiserfs_make_bad_inode(struct inode *inode) { memset(INODE_PKEY(inode), 0, KEY_SIZE); make_bad_inode(inode); } -// -// initially this function was derived from minix or ext2's analog and -// evolved as the prototype did -// - +/* + * initially this function was derived from minix or ext2's analog and + * evolved as the prototype did + */ int reiserfs_init_locked_inode(struct inode *inode, void *p) { struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; @@ -1410,8 +1535,10 @@ int reiserfs_init_locked_inode(struct inode *inode, void *p) return 0; } -/* looks for stat data in the tree, and fills up the fields of in-core - inode stat data fields */ +/* + * looks for stat data in the tree, and fills up the fields of in-core + * inode stat data fields + */ void reiserfs_read_locked_inode(struct inode *inode, struct reiserfs_iget_args *args) { @@ -1422,8 +1549,10 @@ void reiserfs_read_locked_inode(struct inode *inode, dirino = args->dirid; - /* set version 1, version 2 could be used too, because stat data - key is the same in both versions */ + /* + * set version 1, version 2 could be used too, because stat data + * key is the same in both versions + */ key.version = KEY_FORMAT_3_5; key.on_disk_key.k_dir_id = dirino; key.on_disk_key.k_objectid = inode->i_ino; @@ -1439,8 +1568,9 @@ void reiserfs_read_locked_inode(struct inode *inode, reiserfs_make_bad_inode(inode); return; } + + /* a stale NFS handle can trigger this without it being an error */ if (retval != ITEM_FOUND) { - /* a stale NFS handle can trigger this without it being an error */ pathrelse(&path_to_sd); reiserfs_make_bad_inode(inode); clear_nlink(inode); @@ -1449,20 +1579,25 @@ void reiserfs_read_locked_inode(struct inode *inode, init_inode(inode, &path_to_sd); - /* It is possible that knfsd is trying to access inode of a file - that is being removed from the disk by some other thread. As we - update sd on unlink all that is required is to check for nlink - here. This bug was first found by Sizif when debugging - SquidNG/Butterfly, forgotten, and found again after Philippe - Gramoulle <philippe.gramoulle@mmania.com> reproduced it. - - More logical fix would require changes in fs/inode.c:iput() to - remove inode from hash-table _after_ fs cleaned disk stuff up and - in iget() to return NULL if I_FREEING inode is found in - hash-table. */ - /* Currently there is one place where it's ok to meet inode with - nlink==0: processing of open-unlinked and half-truncated files - during mount (fs/reiserfs/super.c:finish_unfinished()). */ + /* + * It is possible that knfsd is trying to access inode of a file + * that is being removed from the disk by some other thread. As we + * update sd on unlink all that is required is to check for nlink + * here. This bug was first found by Sizif when debugging + * SquidNG/Butterfly, forgotten, and found again after Philippe + * Gramoulle <philippe.gramoulle@mmania.com> reproduced it. + + * More logical fix would require changes in fs/inode.c:iput() to + * remove inode from hash-table _after_ fs cleaned disk stuff up and + * in iget() to return NULL if I_FREEING inode is found in + * hash-table. + */ + + /* + * Currently there is one place where it's ok to meet inode with + * nlink==0: processing of open-unlinked and half-truncated files + * during mount (fs/reiserfs/super.c:finish_unfinished()). + */ if ((inode->i_nlink == 0) && !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) { reiserfs_warning(inode->i_sb, "vs-13075", @@ -1472,7 +1607,8 @@ void reiserfs_read_locked_inode(struct inode *inode, reiserfs_make_bad_inode(inode); } - reiserfs_check_path(&path_to_sd); /* init inode should be relsing */ + /* init inode should be relsing */ + reiserfs_check_path(&path_to_sd); /* * Stat data v1 doesn't support ACLs. @@ -1481,7 +1617,7 @@ void reiserfs_read_locked_inode(struct inode *inode, cache_no_acl(inode); } -/** +/* * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). * * @inode: inode from hash table to check @@ -1556,7 +1692,8 @@ static struct dentry *reiserfs_get_dentry(struct super_block *sb, struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - /* fhtype happens to reflect the number of u32s encoded. + /* + * fhtype happens to reflect the number of u32s encoded. * due to a bug in earlier code, fhtype might indicate there * are more u32s then actually fitted. * so if fhtype seems to be more than len, reduce fhtype. @@ -1625,13 +1762,16 @@ int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp, return *lenp; } -/* looks for stat data, then copies fields to it, marks the buffer - containing stat data as dirty */ -/* reiserfs inodes are never really dirty, since the dirty inode call -** always logs them. This call allows the VFS inode marking routines -** to properly mark inodes for datasync and such, but only actually -** does something when called for a synchronous update. -*/ +/* + * looks for stat data, then copies fields to it, marks the buffer + * containing stat data as dirty + */ +/* + * reiserfs inodes are never really dirty, since the dirty inode call + * always logs them. This call allows the VFS inode marking routines + * to properly mark inodes for datasync and such, but only actually + * does something when called for a synchronous update. + */ int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct reiserfs_transaction_handle th; @@ -1639,24 +1779,28 @@ int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (inode->i_sb->s_flags & MS_RDONLY) return -EROFS; - /* memory pressure can sometimes initiate write_inode calls with sync == 1, - ** these cases are just when the system needs ram, not when the - ** inode needs to reach disk for safety, and they can safely be - ** ignored because the altered inode has already been logged. + /* + * memory pressure can sometimes initiate write_inode calls with + * sync == 1, + * these cases are just when the system needs ram, not when the + * inode needs to reach disk for safety, and they can safely be + * ignored because the altered inode has already been logged. */ if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { reiserfs_write_lock(inode->i_sb); if (!journal_begin(&th, inode->i_sb, jbegin_count)) { reiserfs_update_sd(&th, inode); - journal_end_sync(&th, inode->i_sb, jbegin_count); + journal_end_sync(&th); } reiserfs_write_unlock(inode->i_sb); } return 0; } -/* stat data of new object is inserted already, this inserts the item - containing "." and ".." entries */ +/* + * stat data of new object is inserted already, this inserts the item + * containing "." and ".." entries + */ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, struct inode *inode, struct item_head *ih, struct treepath *path, @@ -1674,9 +1818,11 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3 /*key length */ ); - /* compose item head for new item. Directories consist of items of - old type (ITEM_VERSION_1). Do not set key (second arg is 0), it - is done by reiserfs_new_inode */ + /* + * compose item head for new item. Directories consist of items of + * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it + * is done by reiserfs_new_inode + */ if (old_format_only(sb)) { make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); @@ -1714,9 +1860,12 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, return reiserfs_insert_item(th, path, &key, ih, inode, body); } -/* stat data of object has been inserted, this inserts the item - containing the body of symlink */ -static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */ +/* + * stat data of object has been inserted, this inserts the item + * containing the body of symlink + */ +static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, + struct inode *inode, struct item_head *ih, struct treepath *path, const char *symname, int item_len) @@ -1754,15 +1903,26 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i return reiserfs_insert_item(th, path, &key, ih, inode, symname); } -/* inserts the stat data into the tree, and then calls - reiserfs_new_directory (to insert ".", ".." item if new object is - directory) or reiserfs_new_symlink (to insert symlink body if new - object is symlink) or nothing (if new object is regular file) - - NOTE! uid and gid must already be set in the inode. If we return - non-zero due to an error, we have to drop the quota previously allocated - for the fresh inode. This can only be done outside a transaction, so - if we return non-zero, we also end the transaction. */ +/* + * inserts the stat data into the tree, and then calls + * reiserfs_new_directory (to insert ".", ".." item if new object is + * directory) or reiserfs_new_symlink (to insert symlink body if new + * object is symlink) or nothing (if new object is regular file) + + * NOTE! uid and gid must already be set in the inode. If we return + * non-zero due to an error, we have to drop the quota previously allocated + * for the fresh inode. This can only be done outside a transaction, so + * if we return non-zero, we also end the transaction. + * + * @th: active transaction handle + * @dir: parent directory for new inode + * @mode: mode of new inode + * @symname: symlink contents if inode is symlink + * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for + * symlinks + * @inode: inode to be filled + * @security: optional security context to associate with this inode + */ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, struct inode *dir, umode_t mode, const char *symname, /* 0 for regular, EMTRY_DIR_SIZE for dirs, @@ -1807,7 +1967,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, else make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); - memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); + memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE); args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); depth = reiserfs_write_unlock_nested(inode->i_sb); @@ -1820,10 +1980,11 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } if (old_format_only(sb)) - /* not a perfect generation count, as object ids can be reused, but - ** this is as good as reiserfs can do right now. - ** note that the private part of inode isn't filled in yet, we have - ** to use the directory. + /* + * not a perfect generation count, as object ids can be reused, + * but this is as good as reiserfs can do right now. + * note that the private part of inode isn't filled in yet, + * we have to use the directory. */ inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid); else @@ -1850,7 +2011,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 : U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ; - INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list); REISERFS_I(inode)->i_flags = 0; REISERFS_I(inode)->i_prealloc_block = 0; REISERFS_I(inode)->i_prealloc_count = 0; @@ -1878,9 +2039,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, goto out_bad_inode; } if (old_format_only(sb)) { + /* i_uid or i_gid is too big to be stored in stat data v3.5 */ if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) { pathrelse(&path_to_key); - /* i_uid or i_gid is too big to be stored in stat data v3.5 */ err = -EINVAL; goto out_bad_inode; } @@ -1888,9 +2049,11 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } else { inode2sd(&sd, inode, inode->i_size); } - // store in in-core inode the key of stat data and version all - // object items will have (directory items will have old offset - // format, other new objects will consist of new items) + /* + * store in in-core inode the key of stat data and version all + * object items will have (directory items will have old offset + * format, other new objects will consist of new items) + */ if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) set_inode_item_key_version(inode, KEY_FORMAT_3_5); else @@ -1934,7 +2097,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, if (retval) { err = retval; reiserfs_check_path(&path_to_key); - journal_end(th, th->t_super, th->t_blocks_allocated); + journal_end(th); goto out_inserted_sd; } @@ -1945,7 +2108,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, if (retval) { err = retval; reiserfs_check_path(&path_to_key); - journal_end(th, th->t_super, th->t_blocks_allocated); + journal_end(th); goto out_inserted_sd; } } else if (inode->i_sb->s_flags & MS_POSIXACL) { @@ -1962,8 +2125,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, if (retval) { err = retval; reiserfs_check_path(&path_to_key); - retval = journal_end(th, th->t_super, - th->t_blocks_allocated); + retval = journal_end(th); if (retval) err = retval; goto out_inserted_sd; @@ -1975,11 +2137,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, return 0; -/* it looks like you can easily compress these two goto targets into - * one. Keeping it like this doesn't actually hurt anything, and they - * are place holders for what the quota code actually needs. - */ - out_bad_inode: +out_bad_inode: /* Invalidate the object, nothing was inserted yet */ INODE_PKEY(inode)->k_objectid = 0; @@ -1988,16 +2146,19 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, dquot_free_inode(inode); reiserfs_write_lock_nested(inode->i_sb, depth); - out_end_trans: - journal_end(th, th->t_super, th->t_blocks_allocated); - /* Drop can be outside and it needs more credits so it's better to have it outside */ +out_end_trans: + journal_end(th); + /* + * Drop can be outside and it needs more credits so it's better + * to have it outside + */ depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_drop(inode); reiserfs_write_lock_nested(inode->i_sb, depth); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); - out_inserted_sd: +out_inserted_sd: clear_nlink(inode); th->t_trans_id = 0; /* so the caller can't use this handle later */ unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ @@ -2006,25 +2167,26 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } /* -** finds the tail page in the page cache, -** reads the last block in. -** -** On success, page_result is set to a locked, pinned page, and bh_result -** is set to an up to date buffer for the last block in the file. returns 0. -** -** tail conversion is not done, so bh_result might not be valid for writing -** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before -** trying to write the block. -** -** on failure, nonzero is returned, page_result and bh_result are untouched. -*/ + * finds the tail page in the page cache, + * reads the last block in. + * + * On success, page_result is set to a locked, pinned page, and bh_result + * is set to an up to date buffer for the last block in the file. returns 0. + * + * tail conversion is not done, so bh_result might not be valid for writing + * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before + * trying to write the block. + * + * on failure, nonzero is returned, page_result and bh_result are untouched. + */ static int grab_tail_page(struct inode *inode, struct page **page_result, struct buffer_head **bh_result) { - /* we want the page with the last byte in the file, - ** not the page that will hold the next byte for appending + /* + * we want the page with the last byte in the file, + * not the page that will hold the next byte for appending */ unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; unsigned long pos = 0; @@ -2036,10 +2198,11 @@ static int grab_tail_page(struct inode *inode, struct page *page; int error; - /* we know that we are only called with inode->i_size > 0. - ** we also know that a file tail can never be as big as a block - ** If i_size % blocksize == 0, our file is currently block aligned - ** and it won't need converting or zeroing after a truncate. + /* + * we know that we are only called with inode->i_size > 0. + * we also know that a file tail can never be as big as a block + * If i_size % blocksize == 0, our file is currently block aligned + * and it won't need converting or zeroing after a truncate. */ if ((offset & (blocksize - 1)) == 0) { return -ENOENT; @@ -2068,10 +2231,11 @@ static int grab_tail_page(struct inode *inode, } while (bh != head); if (!buffer_uptodate(bh)) { - /* note, this should never happen, prepare_write should - ** be taking care of this for us. If the buffer isn't up to date, - ** I've screwed up the code to find the buffer, or the code to - ** call prepare_write + /* + * note, this should never happen, prepare_write should be + * taking care of this for us. If the buffer isn't up to + * date, I've screwed up the code to find the buffer, or the + * code to call prepare_write */ reiserfs_error(inode->i_sb, "clm-6000", "error reading block %lu", bh->b_blocknr); @@ -2081,21 +2245,21 @@ static int grab_tail_page(struct inode *inode, *bh_result = bh; *page_result = page; - out: +out: return error; - unlock: +unlock: unlock_page(page); page_cache_release(page); return error; } /* -** vfs version of truncate file. Must NOT be called with -** a transaction already started. -** -** some code taken from block_truncate_page -*/ + * vfs version of truncate file. Must NOT be called with + * a transaction already started. + * + * some code taken from block_truncate_page + */ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) { struct reiserfs_transaction_handle th; @@ -2113,9 +2277,11 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) if (inode->i_size > 0) { error = grab_tail_page(inode, &page, &bh); if (error) { - // -ENOENT means we truncated past the end of the file, - // and get_block_create_0 could not find a block to read in, - // which is ok. + /* + * -ENOENT means we truncated past the end of the + * file, and get_block_create_0 could not find a + * block to read in, which is ok. + */ if (error != -ENOENT) reiserfs_error(inode->i_sb, "clm-6001", "grab_tail_page failed %d", @@ -2125,29 +2291,33 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) } } - /* so, if page != NULL, we have a buffer head for the offset at - ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, - ** then we have an unformatted node. Otherwise, we have a direct item, - ** and no zeroing is required on disk. We zero after the truncate, - ** because the truncate might pack the item anyway - ** (it will unmap bh if it packs). + /* + * so, if page != NULL, we have a buffer head for the offset at + * the end of the file. if the bh is mapped, and bh->b_blocknr != 0, + * then we have an unformatted node. Otherwise, we have a direct item, + * and no zeroing is required on disk. We zero after the truncate, + * because the truncate might pack the item anyway + * (it will unmap bh if it packs). + * + * it is enough to reserve space in transaction for 2 balancings: + * one for "save" link adding and another for the first + * cut_from_item. 1 is for update_sd */ - /* it is enough to reserve space in transaction for 2 balancings: - one for "save" link adding and another for the first - cut_from_item. 1 is for update_sd */ error = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); if (error) goto out; reiserfs_update_inode_transaction(inode); if (update_timestamps) - /* we are doing real truncate: if the system crashes before the last - transaction of truncating gets committed - on reboot the file - either appears truncated properly or not truncated at all */ + /* + * we are doing real truncate: if the system crashes + * before the last transaction of truncating gets committed + * - on reboot the file either appears truncated properly + * or not truncated at all + */ add_save_link(&th, inode, 1); err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps); - error = - journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); + error = journal_end(&th); if (error) goto out; @@ -2180,7 +2350,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) reiserfs_write_unlock(inode->i_sb); return 0; - out: +out: if (page) { unlock_page(page); page_cache_release(page); @@ -2212,7 +2382,10 @@ static int map_block_for_writepage(struct inode *inode, int copy_size; int trans_running = 0; - /* catch places below that try to log something without starting a trans */ + /* + * catch places below that try to log something without + * starting a trans + */ th.t_trans_id = 0; if (!buffer_uptodate(bh_result)) { @@ -2220,11 +2393,11 @@ static int map_block_for_writepage(struct inode *inode, } kmap(bh_result->b_page); - start_over: +start_over: reiserfs_write_lock(inode->i_sb); make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3); - research: +research: retval = search_for_position_by_key(inode->i_sb, &key, &path); if (retval != POSITION_FOUND) { use_get_block = 1; @@ -2232,8 +2405,8 @@ static int map_block_for_writepage(struct inode *inode, } bh = get_last_bh(&path); - ih = get_ih(&path); - item = get_item(&path); + ih = tp_item_head(&path); + item = tp_item_body(&path); pos_in_item = path.pos_in_item; /* we've found an unformatted node */ @@ -2281,10 +2454,10 @@ static int map_block_for_writepage(struct inode *inode, goto research; } - memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, + memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied, copy_size); - journal_mark_dirty(&th, inode->i_sb, bh); + journal_mark_dirty(&th, bh); bytes_copied += copy_size; set_block_dev_mapped(bh_result, 0, inode); @@ -2304,10 +2477,10 @@ static int map_block_for_writepage(struct inode *inode, } retval = 0; - out: +out: pathrelse(&path); if (trans_running) { - int err = journal_end(&th, inode->i_sb, jbegin_count); + int err = journal_end(&th); if (err) retval = err; trans_running = 0; @@ -2331,7 +2504,8 @@ static int map_block_for_writepage(struct inode *inode, kunmap(bh_result->b_page); if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { - /* we've copied data from the page into the direct item, so the + /* + * we've copied data from the page into the direct item, so the * buffer in the page is now clean, mark it to reflect that. */ lock_buffer(bh_result); @@ -2370,7 +2544,8 @@ static int reiserfs_write_full_page(struct page *page, return 0; } - /* The page dirty bit is cleared before writepage is called, which + /* + * The page dirty bit is cleared before writepage is called, which * means we have to tell create_empty_buffers to make dirty buffers * The page really should be up to date at this point, so tossing * in the BH_Uptodate is just a sanity check. @@ -2381,8 +2556,9 @@ static int reiserfs_write_full_page(struct page *page, } head = page_buffers(page); - /* last page in the file, zero out any contents past the - ** last byte in the file + /* + * last page in the file, zero out any contents past the + * last byte in the file */ if (page->index >= end_index) { unsigned last_offset; @@ -2412,7 +2588,8 @@ static int reiserfs_write_full_page(struct page *page, (!buffer_mapped(bh) || (buffer_mapped(bh) && bh->b_blocknr == 0))) { - /* not mapped yet, or it points to a direct item, search + /* + * not mapped yet, or it points to a direct item, search * the btree for the mapping info, and log any direct * items found */ @@ -2450,10 +2627,11 @@ static int reiserfs_write_full_page(struct page *page, if (checked) { reiserfs_prepare_for_journal(s, bh, 1); - journal_mark_dirty(&th, s, bh); + journal_mark_dirty(&th, bh); continue; } - /* from this point on, we know the buffer is mapped to a + /* + * from this point on, we know the buffer is mapped to a * real block and not a direct item */ if (wbc->sync_mode != WB_SYNC_NONE) { @@ -2472,7 +2650,7 @@ static int reiserfs_write_full_page(struct page *page, } while ((bh = bh->b_this_page) != head); if (checked) { - error = journal_end(&th, s, bh_per_page + 1); + error = journal_end(&th); reiserfs_write_unlock(s); if (error) goto fail; @@ -2497,7 +2675,7 @@ static int reiserfs_write_full_page(struct page *page, } while (bh != head); error = 0; - done: +done: if (nr == 0) { /* * if this page only had a direct item, it is very possible for @@ -2519,8 +2697,9 @@ static int reiserfs_write_full_page(struct page *page, } return error; - fail: - /* catches various errors, we need to make sure any valid dirty blocks +fail: + /* + * catches various errors, we need to make sure any valid dirty blocks * get to the media. The page is currently locked and not marked for * writeback */ @@ -2533,8 +2712,8 @@ static int reiserfs_write_full_page(struct page *page, mark_buffer_async_write(bh); } else { /* - * clear any dirty bits that might have come from getting - * attached to a dirty page + * clear any dirty bits that might have come from + * getting attached to a dirty page */ clear_buffer_dirty(bh); } @@ -2614,15 +2793,18 @@ static int reiserfs_write_begin(struct file *file, ret = __block_write_begin(page, pos, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; - /* this gets a little ugly. If reiserfs_get_block returned an - * error and left a transacstion running, we've got to close it, - * and we've got to free handle if it was a persistent transaction. + /* + * this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close + * it, and we've got to free handle if it was a persistent + * transaction. * * But, if we had nested into an existing transaction, we need * to just drop the ref count on the handle. * * If old_ref == 0, the transaction is from reiserfs_get_block, - * and it was a persistent trans. Otherwise, it was nested above. + * and it was a persistent trans. Otherwise, it was nested + * above. */ if (th->t_refcount > old_ref) { if (old_ref) @@ -2671,15 +2853,18 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) ret = __block_write_begin(page, from, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; - /* this gets a little ugly. If reiserfs_get_block returned an - * error and left a transacstion running, we've got to close it, - * and we've got to free handle if it was a persistent transaction. + /* + * this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close + * it, and we've got to free handle if it was a persistent + * transaction. * * But, if we had nested into an existing transaction, we need * to just drop the ref count on the handle. * * If old_ref == 0, the transaction is from reiserfs_get_block, - * and it was a persistent trans. Otherwise, it was nested above. + * and it was a persistent trans. Otherwise, it was nested + * above. */ if (th->t_refcount > old_ref) { if (old_ref) @@ -2734,17 +2919,20 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, reiserfs_commit_page(inode, page, start, start + copied); - /* generic_commit_write does this for us, but does not update the - ** transaction tracking stuff when the size changes. So, we have - ** to do the i_size updates here. + /* + * generic_commit_write does this for us, but does not update the + * transaction tracking stuff when the size changes. So, we have + * to do the i_size updates here. */ if (pos + copied > inode->i_size) { struct reiserfs_transaction_handle myth; reiserfs_write_lock(inode->i_sb); locked = true; - /* If the file have grown beyond the border where it - can have a tail, unmark it as needing a tail - packing */ + /* + * If the file have grown beyond the border where it + * can have a tail, unmark it as needing a tail + * packing + */ if ((have_large_tails(inode->i_sb) && inode->i_size > i_block_size(inode) * 4) || (have_small_tails(inode->i_sb) @@ -2759,13 +2947,13 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, inode->i_size = pos + copied; /* * this will just nest into our transaction. It's important - * to use mark_inode_dirty so the inode gets pushed around on the - * dirty lists, and so that O_SYNC works as expected + * to use mark_inode_dirty so the inode gets pushed around on + * the dirty lists, and so that O_SYNC works as expected */ mark_inode_dirty(inode); reiserfs_update_sd(&myth, inode); update_sd = 1; - ret = journal_end(&myth, inode->i_sb, 1); + ret = journal_end(&myth); if (ret) goto journal_error; } @@ -2781,7 +2969,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, goto out; } - out: +out: if (locked) reiserfs_write_unlock(inode->i_sb); unlock_page(page); @@ -2792,7 +2980,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, return ret == 0 ? copied : ret; - journal_error: +journal_error: reiserfs_write_unlock(inode->i_sb); locked = false; if (th) { @@ -2822,15 +3010,18 @@ int reiserfs_commit_write(struct file *f, struct page *page, } reiserfs_commit_page(inode, page, from, to); - /* generic_commit_write does this for us, but does not update the - ** transaction tracking stuff when the size changes. So, we have - ** to do the i_size updates here. + /* + * generic_commit_write does this for us, but does not update the + * transaction tracking stuff when the size changes. So, we have + * to do the i_size updates here. */ if (pos > inode->i_size) { struct reiserfs_transaction_handle myth; - /* If the file have grown beyond the border where it - can have a tail, unmark it as needing a tail - packing */ + /* + * If the file have grown beyond the border where it + * can have a tail, unmark it as needing a tail + * packing + */ if ((have_large_tails(inode->i_sb) && inode->i_size > i_block_size(inode) * 4) || (have_small_tails(inode->i_sb) @@ -2845,13 +3036,13 @@ int reiserfs_commit_write(struct file *f, struct page *page, inode->i_size = pos; /* * this will just nest into our transaction. It's important - * to use mark_inode_dirty so the inode gets pushed around on the - * dirty lists, and so that O_SYNC works as expected + * to use mark_inode_dirty so the inode gets pushed around + * on the dirty lists, and so that O_SYNC works as expected */ mark_inode_dirty(inode); reiserfs_update_sd(&myth, inode); update_sd = 1; - ret = journal_end(&myth, inode->i_sb, 1); + ret = journal_end(&myth); if (ret) goto journal_error; } @@ -2863,10 +3054,10 @@ int reiserfs_commit_write(struct file *f, struct page *page, goto out; } - out: +out: return ret; - journal_error: +journal_error: if (th) { if (!update_sd) reiserfs_update_sd(th, inode); @@ -2924,9 +3115,10 @@ void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) } } -/* decide if this buffer needs to stay around for data logging or ordered -** write purposes -*/ +/* + * decide if this buffer needs to stay around for data logging or ordered + * write purposes + */ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) { int ret = 1; @@ -2937,7 +3129,8 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) if (!buffer_mapped(bh)) { goto free_jh; } - /* the page is locked, and the only places that log a data buffer + /* + * the page is locked, and the only places that log a data buffer * also lock the page. */ if (reiserfs_file_data_log(inode)) { @@ -2952,7 +3145,8 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) struct reiserfs_journal_list *jl; struct reiserfs_jh *jh = bh->b_private; - /* why is this safe? + /* + * why is this safe? * reiserfs_setattr updates i_size in the on disk * stat data before allowing vmtruncate to be called. * @@ -2969,7 +3163,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) ret = 0; } - free_jh: +free_jh: if (ret && bh->b_private) { reiserfs_free_jh(bh); } @@ -3028,7 +3222,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset, ret = try_to_release_page(page, 0); /* maybe should BUG_ON(!ret); - neilb */ } - out: +out: return; } @@ -3080,18 +3274,20 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) return ret; } -/* We thank Mingming Cao for helping us understand in great detail what - to do in this section of the code. */ +/* + * We thank Mingming Cao for helping us understand in great detail what + * to do in this section of the code. + */ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - reiserfs_get_blocks_direct_io); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, + reiserfs_get_blocks_direct_io); /* * In case of error extending write may have instantiated a few @@ -3099,7 +3295,7 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if ((end > isize) && inode_newsize_ok(inode, isize) == 0) { truncate_setsize(inode, isize); @@ -3127,8 +3323,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) dquot_initialize(inode); reiserfs_write_lock(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { - /* version 2 items will be caught by the s_maxbytes check - ** done for us in vmtruncate + /* + * version 2 items will be caught by the s_maxbytes check + * done for us in vmtruncate */ if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && attr->ia_size > MAX_NON_LFS) { @@ -3149,7 +3346,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) err = journal_begin(&th, inode->i_sb, 4); if (!err) { reiserfs_discard_prealloc(&th, inode); - err = journal_end(&th, inode->i_sb, 4); + err = journal_end(&th); } if (err) error = err; @@ -3189,7 +3386,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ + /* + * (user+group)*(old+new) structure - we count quota + * info and , inode write (sb, inode) + */ reiserfs_write_lock(inode->i_sb); error = journal_begin(&th, inode->i_sb, jbegin_count); reiserfs_write_unlock(inode->i_sb); @@ -3198,19 +3398,21 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) error = dquot_transfer(inode, attr); reiserfs_write_lock(inode->i_sb); if (error) { - journal_end(&th, inode->i_sb, jbegin_count); + journal_end(&th); reiserfs_write_unlock(inode->i_sb); goto out; } - /* Update corresponding info in inode so that everything is in - * one transaction */ + /* + * Update corresponding info in inode so that everything + * is in one transaction + */ if (attr->ia_valid & ATTR_UID) inode->i_uid = attr->ia_uid; if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; mark_inode_dirty(inode); - error = journal_end(&th, inode->i_sb, jbegin_count); + error = journal_end(&th); reiserfs_write_unlock(inode->i_sb); if (error) goto out; @@ -3220,8 +3422,14 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) { error = inode_newsize_ok(inode, attr->ia_size); if (!error) { + /* + * Could race against reiserfs_file_release + * if called from NFS, so take tailpack mutex. + */ + mutex_lock(&REISERFS_I(inode)->tailpack); truncate_setsize(inode, attr->ia_size); - reiserfs_vfs_truncate_file(inode); + reiserfs_truncate_file(inode, 1); + mutex_unlock(&REISERFS_I(inode)->tailpack); } } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 946ccbf5b5a..501ed6811a2 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -15,7 +15,8 @@ * reiserfs_ioctl - handler for ioctl for inode * supported commands: * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect - * and prevent packing file (argument arg has to be non-zero) + * and prevent packing file (argument arg has t + * be non-zero) * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION * 3) That's all for a while ... */ @@ -132,7 +133,10 @@ setversion_out: long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - /* These are just misnamed, they actually get/put from/to user an int */ + /* + * These are just misnamed, they actually + * get/put from/to user an int + */ switch (cmd) { case REISERFS_IOC32_UNPACK: cmd = REISERFS_IOC_UNPACK; @@ -160,10 +164,10 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); /* -** reiserfs_unpack -** Function try to convert tail from direct item into indirect. -** It set up nopack attribute in the REISERFS_I(inode)->nopack -*/ + * reiserfs_unpack + * Function try to convert tail from direct item into indirect. + * It set up nopack attribute in the REISERFS_I(inode)->nopack + */ int reiserfs_unpack(struct inode *inode, struct file *filp) { int retval = 0; @@ -194,9 +198,10 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) goto out; } - /* we unpack by finding the page with the tail, and calling - ** __reiserfs_write_begin on that page. This will force a - ** reiserfs_get_block to unpack the tail for us. + /* + * we unpack by finding the page with the tail, and calling + * __reiserfs_write_begin on that page. This will force a + * reiserfs_get_block to unpack the tail for us. */ index = inode->i_size >> PAGE_CACHE_SHIFT; mapping = inode->i_mapping; @@ -214,11 +219,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) retval = reiserfs_commit_write(NULL, page, write_from, write_from); REISERFS_I(inode)->i_flags |= i_nopack_mask; - out_unlock: +out_unlock: unlock_page(page); page_cache_release(page); - out: +out: mutex_unlock(&inode->i_mutex); reiserfs_write_unlock(inode->i_sb); return retval; diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index ee382ef3d30..cfaee912ee0 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -5,15 +5,17 @@ #include <linux/time.h> #include "reiserfs.h" -// this contains item handlers for old item types: sd, direct, -// indirect, directory +/* + * this contains item handlers for old item types: sd, direct, + * indirect, directory + */ -/* and where are the comments? how about saying where we can find an - explanation of each item handler method? -Hans */ +/* + * and where are the comments? how about saying where we can find an + * explanation of each item handler method? -Hans + */ -////////////////////////////////////////////////////////////////////////////// -// stat data functions -// +/* stat data functions */ static int sd_bytes_number(struct item_head *ih, int block_size) { return 0; @@ -60,7 +62,7 @@ static void sd_print_item(struct item_head *ih, char *item) static void sd_check_item(struct item_head *ih, char *item) { - // FIXME: type something here! + /* unused */ } static int sd_create_vi(struct virtual_node *vn, @@ -68,7 +70,6 @@ static int sd_create_vi(struct virtual_node *vn, int is_affected, int insert_size) { vi->vi_index = TYPE_STAT_DATA; - //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed? return 0; } @@ -117,15 +118,13 @@ static struct item_operations stat_data_ops = { .print_vi = sd_print_vi }; -////////////////////////////////////////////////////////////////////////////// -// direct item functions -// +/* direct item functions */ static int direct_bytes_number(struct item_head *ih, int block_size) { return ih_item_len(ih); } -// FIXME: this should probably switch to indirect as well +/* FIXME: this should probably switch to indirect as well */ static void direct_decrement_key(struct cpu_key *key) { cpu_key_k_offset_dec(key); @@ -144,7 +143,7 @@ static void direct_print_item(struct item_head *ih, char *item) { int j = 0; -// return; +/* return; */ printk("\""); while (j < ih_item_len(ih)) printk("%c", item[j++]); @@ -153,7 +152,7 @@ static void direct_print_item(struct item_head *ih, char *item) static void direct_check_item(struct item_head *ih, char *item) { - // FIXME: type something here! + /* unused */ } static int direct_create_vi(struct virtual_node *vn, @@ -161,7 +160,6 @@ static int direct_create_vi(struct virtual_node *vn, int is_affected, int insert_size) { vi->vi_index = TYPE_DIRECT; - //vi->vi_type |= VI_TYPE_DIRECT; return 0; } @@ -211,16 +209,13 @@ static struct item_operations direct_ops = { .print_vi = direct_print_vi }; -////////////////////////////////////////////////////////////////////////////// -// indirect item functions -// - +/* indirect item functions */ static int indirect_bytes_number(struct item_head *ih, int block_size) { - return ih_item_len(ih) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih); + return ih_item_len(ih) / UNFM_P_SIZE * block_size; } -// decrease offset, if it becomes 0, change type to stat data +/* decrease offset, if it becomes 0, change type to stat data */ static void indirect_decrement_key(struct cpu_key *key) { cpu_key_k_offset_dec(key); @@ -228,7 +223,7 @@ static void indirect_decrement_key(struct cpu_key *key) set_cpu_key_k_type(key, TYPE_STAT_DATA); } -// if it is not first item of the body, then it is mergeable +/* if it is not first item of the body, then it is mergeable */ static int indirect_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) { @@ -236,7 +231,7 @@ static int indirect_is_left_mergeable(struct reiserfs_key *key, return (le_key_k_offset(version, key) != 1); } -// printing of indirect item +/* printing of indirect item */ static void start_new_sequence(__u32 * start, int *len, __u32 new) { *start = new; @@ -295,7 +290,7 @@ static void indirect_print_item(struct item_head *ih, char *item) static void indirect_check_item(struct item_head *ih, char *item) { - // FIXME: type something here! + /* unused */ } static int indirect_create_vi(struct virtual_node *vn, @@ -303,7 +298,6 @@ static int indirect_create_vi(struct virtual_node *vn, int is_affected, int insert_size) { vi->vi_index = TYPE_INDIRECT; - //vi->vi_type |= VI_TYPE_INDIRECT; return 0; } @@ -321,16 +315,19 @@ static int indirect_check_right(struct virtual_item *vi, int free) return indirect_check_left(vi, free, 0, 0); } -// return size in bytes of 'units' units. If first == 0 - calculate from the head (left), otherwise - from tail (right) +/* + * return size in bytes of 'units' units. If first == 0 - calculate + * from the head (left), otherwise - from tail (right) + */ static int indirect_part_size(struct virtual_item *vi, int first, int units) { - // unit of indirect item is byte (yet) + /* unit of indirect item is byte (yet) */ return units; } static int indirect_unit_num(struct virtual_item *vi) { - // unit of indirect item is byte (yet) + /* unit of indirect item is byte (yet) */ return vi->vi_item_len - IH_SIZE; } @@ -356,10 +353,7 @@ static struct item_operations indirect_ops = { .print_vi = indirect_print_vi }; -////////////////////////////////////////////////////////////////////////////// -// direntry functions -// - +/* direntry functions */ static int direntry_bytes_number(struct item_head *ih, int block_size) { reiserfs_warning(NULL, "vs-16090", @@ -396,7 +390,7 @@ static void direntry_print_item(struct item_head *ih, char *item) deh = (struct reiserfs_de_head *)item; - for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) { + for (i = 0; i < ih_entry_count(ih); i++, deh++) { namelen = (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - deh_location(deh); @@ -428,9 +422,9 @@ static void direntry_check_item(struct item_head *ih, char *item) int i; struct reiserfs_de_head *deh; - // FIXME: type something here! + /* unused */ deh = (struct reiserfs_de_head *)item; - for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) { + for (i = 0; i < ih_entry_count(ih); i++, deh++) { ; } } @@ -439,7 +433,8 @@ static void direntry_check_item(struct item_head *ih, char *item) /* * function returns old entry number in directory item in real node - * using new entry number in virtual item in virtual node */ + * using new entry number in virtual item in virtual node + */ static inline int old_entry_num(int is_affected, int virtual_entry_num, int pos_in_item, int mode) { @@ -463,9 +458,11 @@ static inline int old_entry_num(int is_affected, int virtual_entry_num, return virtual_entry_num - 1; } -/* Create an array of sizes of directory entries for virtual - item. Return space used by an item. FIXME: no control over - consuming of space used by this item handler */ +/* + * Create an array of sizes of directory entries for virtual + * item. Return space used by an item. FIXME: no control over + * consuming of space used by this item handler + */ static int direntry_create_vi(struct virtual_node *vn, struct virtual_item *vi, int is_affected, int insert_size) @@ -494,8 +491,8 @@ static int direntry_create_vi(struct virtual_node *vn, j = old_entry_num(is_affected, i, vn->vn_pos_in_item, vn->vn_mode); dir_u->entry_sizes[i] = - (j ? deh_location(&(deh[j - 1])) : ih_item_len(vi->vi_ih)) - - deh_location(&(deh[j])) + DEH_SIZE; + (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) - + deh_location(&deh[j]) + DEH_SIZE; } size += (dir_u->entry_count * sizeof(short)); @@ -529,10 +526,10 @@ static int direntry_create_vi(struct virtual_node *vn, } -// -// return number of entries which may fit into specified amount of -// free space, or -1 if free space is not enough even for 1 entry -// +/* + * return number of entries which may fit into specified amount of + * free space, or -1 if free space is not enough even for 1 entry + */ static int direntry_check_left(struct virtual_item *vi, int free, int start_skip, int end_skip) { @@ -541,8 +538,8 @@ static int direntry_check_left(struct virtual_item *vi, int free, struct direntry_uarea *dir_u = vi->vi_uarea; for (i = start_skip; i < dir_u->entry_count - end_skip; i++) { + /* i-th entry doesn't fit into the remaining free space */ if (dir_u->entry_sizes[i] > free) - /* i-th entry doesn't fit into the remaining free space */ break; free -= dir_u->entry_sizes[i]; @@ -570,8 +567,8 @@ static int direntry_check_right(struct virtual_item *vi, int free) struct direntry_uarea *dir_u = vi->vi_uarea; for (i = dir_u->entry_count - 1; i >= 0; i--) { + /* i-th entry doesn't fit into the remaining free space */ if (dir_u->entry_sizes[i] > free) - /* i-th entry doesn't fit into the remaining free space */ break; free -= dir_u->entry_sizes[i]; @@ -643,9 +640,7 @@ static struct item_operations direntry_ops = { .print_vi = direntry_print_vi }; -////////////////////////////////////////////////////////////////////////////// -// Error catching functions to catch errors caused by incorrect item types. -// +/* Error catching functions to catch errors caused by incorrect item types. */ static int errcatch_bytes_number(struct item_head *ih, int block_size) { reiserfs_warning(NULL, "green-16001", @@ -685,8 +680,12 @@ static int errcatch_create_vi(struct virtual_node *vn, { reiserfs_warning(NULL, "green-16006", "Invalid item type observed, run fsck ASAP"); - return 0; // We might return -1 here as well, but it won't help as create_virtual_node() from where - // this operation is called from is of return type void. + /* + * We might return -1 here as well, but it won't help as + * create_virtual_node() from where this operation is called + * from is of return type void. + */ + return 0; } static int errcatch_check_left(struct virtual_item *vi, int free, @@ -739,9 +738,6 @@ static struct item_operations errcatch_ops = { errcatch_print_vi }; -////////////////////////////////////////////////////////////////////////////// -// -// #if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) #error Item types must use disk-format assigned values. #endif diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index fd777032c2b..e8870de4627 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1,38 +1,38 @@ /* -** Write ahead logging implementation copyright Chris Mason 2000 -** -** The background commits make this code very interrelated, and -** overly complex. I need to rethink things a bit....The major players: -** -** journal_begin -- call with the number of blocks you expect to log. -** If the current transaction is too -** old, it will block until the current transaction is -** finished, and then start a new one. -** Usually, your transaction will get joined in with -** previous ones for speed. -** -** journal_join -- same as journal_begin, but won't block on the current -** transaction regardless of age. Don't ever call -** this. Ever. There are only two places it should be -** called from, and they are both inside this file. -** -** journal_mark_dirty -- adds blocks into this transaction. clears any flags -** that might make them get sent to disk -** and then marks them BH_JDirty. Puts the buffer head -** into the current transaction hash. -** -** journal_end -- if the current transaction is batchable, it does nothing -** otherwise, it could do an async/synchronous commit, or -** a full flush of all log and real blocks in the -** transaction. -** -** flush_old_commits -- if the current transaction is too old, it is ended and -** commit blocks are sent to disk. Forces commit blocks -** to disk for all backgrounded commits that have been -** around too long. -** -- Note, if you call this as an immediate flush from -** from within kupdate, it will ignore the immediate flag -*/ + * Write ahead logging implementation copyright Chris Mason 2000 + * + * The background commits make this code very interrelated, and + * overly complex. I need to rethink things a bit....The major players: + * + * journal_begin -- call with the number of blocks you expect to log. + * If the current transaction is too + * old, it will block until the current transaction is + * finished, and then start a new one. + * Usually, your transaction will get joined in with + * previous ones for speed. + * + * journal_join -- same as journal_begin, but won't block on the current + * transaction regardless of age. Don't ever call + * this. Ever. There are only two places it should be + * called from, and they are both inside this file. + * + * journal_mark_dirty -- adds blocks into this transaction. clears any flags + * that might make them get sent to disk + * and then marks them BH_JDirty. Puts the buffer head + * into the current transaction hash. + * + * journal_end -- if the current transaction is batchable, it does nothing + * otherwise, it could do an async/synchronous commit, or + * a full flush of all log and real blocks in the + * transaction. + * + * flush_old_commits -- if the current transaction is too old, it is ended and + * commit blocks are sent to disk. Forces commit blocks + * to disk for all backgrounded commits that have been + * around too long. + * -- Note, if you call this as an immediate flush from + * from within kupdate, it will ignore the immediate flag + */ #include <linux/time.h> #include <linux/semaphore.h> @@ -58,23 +58,19 @@ #define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ j_working_list)) -/* the number of mounted filesystems. This is used to decide when to -** start and kill the commit workqueue -*/ -static int reiserfs_mounted_fs_count; - -static struct workqueue_struct *commit_wq; - -#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit - structs at 4k */ +/* must be correct to keep the desc and commit structs at 4k */ +#define JOURNAL_TRANS_HALF 1018 #define BUFNR 64 /*read ahead */ /* cnode stat bits. Move these into reiserfs_fs.h */ -#define BLOCK_FREED 2 /* this block was freed, and can't be written. */ -#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ +/* this block was freed, and can't be written. */ +#define BLOCK_FREED 2 +/* this block was freed during this transaction, and can't be written */ +#define BLOCK_FREED_HOLDER 3 -#define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ +/* used in flush_journal_list */ +#define BLOCK_NEEDS_FLUSH 4 #define BLOCK_DIRTIED 5 /* journal list state bits */ @@ -87,16 +83,14 @@ static struct workqueue_struct *commit_wq; #define COMMIT_NOW 2 /* end and commit this transaction */ #define WAIT 4 /* wait for the log blocks to hit the disk */ -static int do_journal_end(struct reiserfs_transaction_handle *, - struct super_block *, unsigned long nblocks, - int flags); +static int do_journal_end(struct reiserfs_transaction_handle *, int flags); static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall); static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall); static int can_dirty(struct reiserfs_journal_cnode *cn); static int journal_join(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks); + struct super_block *sb); static void release_journal_dev(struct super_block *super, struct reiserfs_journal *journal); static int dirty_one_transaction(struct super_block *s, @@ -107,8 +101,10 @@ static void queue_log_writer(struct super_block *s); /* values for join in do_journal_begin_r */ enum { JBEGIN_REG = 0, /* regular journal begin */ - JBEGIN_JOIN = 1, /* join the running transaction if at all possible */ - JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */ + /* join the running transaction if at all possible */ + JBEGIN_JOIN = 1, + /* called from cleanup code, ignores aborted flag */ + JBEGIN_ABORT = 2, }; static int do_journal_begin_r(struct reiserfs_transaction_handle *th, @@ -123,10 +119,11 @@ static void init_journal_hash(struct super_block *sb) } /* -** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to -** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for -** more details. -*/ + * clears BH_Dirty and sticks the buffer on the clean list. Called because + * I can't allow refile_buffer to make schedule happen after I've freed a + * block. Look at remove_from_transaction and journal_mark_freed for + * more details. + */ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) { if (bh) { @@ -163,7 +160,7 @@ static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb) struct list_head *entry = journal->j_bitmap_nodes.next; journal->j_used_bitmap_nodes++; - repeat: +repeat: if (entry != &journal->j_bitmap_nodes) { bn = list_entry(entry, struct reiserfs_bitmap_node, list); @@ -204,7 +201,8 @@ static void allocate_bitmap_nodes(struct super_block *sb) list_add(&bn->list, &journal->j_bitmap_nodes); journal->j_free_bitmap_nodes++; } else { - break; /* this is ok, we'll try again when more are needed */ + /* this is ok, we'll try again when more are needed */ + break; } } } @@ -239,8 +237,8 @@ static void cleanup_bitmap_list(struct super_block *sb, } /* -** only call this on FS unmount. -*/ + * only call this on FS unmount. + */ static int free_list_bitmaps(struct super_block *sb, struct reiserfs_list_bitmap *jb_array) { @@ -275,9 +273,9 @@ static int free_bitmap_nodes(struct super_block *sb) } /* -** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. -** jb_array is the array to be filled in. -*/ + * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. + * jb_array is the array to be filled in. + */ int reiserfs_allocate_list_bitmaps(struct super_block *sb, struct reiserfs_list_bitmap *jb_array, unsigned int bmap_nr) @@ -306,9 +304,9 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb, } /* -** find an available list bitmap. If you can't find one, flush a commit list -** and try again -*/ + * find an available list bitmap. If you can't find one, flush a commit list + * and try again + */ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb, struct reiserfs_journal_list *jl) @@ -332,18 +330,18 @@ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb, break; } } - if (jb->journal_list) { /* double check to make sure if flushed correctly */ + /* double check to make sure if flushed correctly */ + if (jb->journal_list) return NULL; - } jb->journal_list = jl; return jb; } /* -** allocates a new chunk of X nodes, and links them all together as a list. -** Uses the cnode->next and cnode->prev pointers -** returns NULL on failure -*/ + * allocates a new chunk of X nodes, and links them all together as a list. + * Uses the cnode->next and cnode->prev pointers + * returns NULL on failure + */ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) { struct reiserfs_journal_cnode *head; @@ -365,9 +363,7 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) return head; } -/* -** pulls a cnode off the free list, or returns NULL on failure -*/ +/* pulls a cnode off the free list, or returns NULL on failure */ static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) { struct reiserfs_journal_cnode *cn; @@ -393,8 +389,8 @@ static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) } /* -** returns a cnode to the free list -*/ + * returns a cnode to the free list + */ static void free_cnode(struct super_block *sb, struct reiserfs_journal_cnode *cn) { @@ -419,7 +415,10 @@ static void clear_prepared_bits(struct buffer_head *bh) clear_buffer_journal_restore_dirty(bh); } -/* return a cnode with same dev, block number and size in table, or null if not found */ +/* + * return a cnode with same dev, block number and size in table, + * or null if not found + */ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct super_block *sb, @@ -439,23 +438,24 @@ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct } /* -** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated -** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever -** being overwritten by a replay after crashing. -** -** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting -** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make -** sure you never write the block without logging it. -** -** next_zero_bit is a suggestion about the next block to try for find_forward. -** when bl is rejected because it is set in a journal list bitmap, we search -** for the next zero bit in the bitmap that rejected bl. Then, we return that -** through next_zero_bit for find_forward to try. -** -** Just because we return something in next_zero_bit does not mean we won't -** reject it on the next call to reiserfs_in_journal -** -*/ + * this actually means 'can this block be reallocated yet?'. If you set + * search_all, a block can only be allocated if it is not in the current + * transaction, was not freed by the current transaction, and has no chance + * of ever being overwritten by a replay after crashing. + * + * If you don't set search_all, a block can only be allocated if it is not + * in the current transaction. Since deleting a block removes it from the + * current transaction, this case should never happen. If you don't set + * search_all, make sure you never write the block without logging it. + * + * next_zero_bit is a suggestion about the next block to try for find_forward. + * when bl is rejected because it is set in a journal list bitmap, we search + * for the next zero bit in the bitmap that rejected bl. Then, we return + * that through next_zero_bit for find_forward to try. + * + * Just because we return something in next_zero_bit does not mean we won't + * reject it on the next call to reiserfs_in_journal + */ int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, int bit_nr, int search_all, b_blocknr_t * next_zero_bit) @@ -469,9 +469,11 @@ int reiserfs_in_journal(struct super_block *sb, *next_zero_bit = 0; /* always start this at zero. */ PROC_INFO_INC(sb, journal.in_journal); - /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. - ** if we crash before the transaction that freed it commits, this transaction won't - ** have committed either, and the block will never be written + /* + * If we aren't doing a search_all, this is a metablock, and it + * will be logged before use. if we crash before the transaction + * that freed it commits, this transaction won't have committed + * either, and the block will never be written */ if (search_all) { for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { @@ -511,8 +513,7 @@ int reiserfs_in_journal(struct super_block *sb, return 0; } -/* insert cn into table -*/ +/* insert cn into table */ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_cnode *cn) { @@ -558,10 +559,10 @@ static inline void put_journal_list(struct super_block *s, } /* -** this used to be much more involved, and I'm keeping it just in case things get ugly again. -** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a -** transaction. -*/ + * this used to be much more involved, and I'm keeping it just in case + * things get ugly again. it gets called by flush_commit_list, and + * cleans up any data stored about blocks freed during a transaction. + */ static void cleanup_freed_for_journal_list(struct super_block *sb, struct reiserfs_journal_list *jl) { @@ -756,11 +757,12 @@ static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, jh = bh->b_private; list_del_init(&jh->list); } else { - no_jh: +no_jh: get_bh(bh); jh = alloc_jh(); spin_lock(&j->j_dirty_buffers_lock); - /* buffer must be locked for __add_jh, should be able to have + /* + * buffer must be locked for __add_jh, should be able to have * two adds at the same time */ BUG_ON(bh->b_private); @@ -818,7 +820,8 @@ static int write_ordered_buffers(spinlock_t * lock, spin_lock(lock); goto loop_next; } - /* in theory, dirty non-uptodate buffers should never get here, + /* + * in theory, dirty non-uptodate buffers should never get here, * but the upper layer io error paths still have a few quirks. * Handle them here as gracefully as we can */ @@ -833,7 +836,7 @@ static int write_ordered_buffers(spinlock_t * lock, reiserfs_free_jh(bh); unlock_buffer(bh); } - loop_next: +loop_next: put_bh(bh); cond_resched_lock(lock); } @@ -856,13 +859,14 @@ static int write_ordered_buffers(spinlock_t * lock, if (!buffer_uptodate(bh)) { ret = -EIO; } - /* ugly interaction with invalidatepage here. - * reiserfs_invalidate_page will pin any buffer that has a valid - * journal head from an older transaction. If someone else sets - * our buffer dirty after we write it in the first loop, and - * then someone truncates the page away, nobody will ever write - * the buffer. We're safe if we write the page one last time - * after freeing the journal header. + /* + * ugly interaction with invalidatepage here. + * reiserfs_invalidate_page will pin any buffer that has a + * valid journal head from an older transaction. If someone + * else sets our buffer dirty after we write it in the first + * loop, and then someone truncates the page away, nobody + * will ever write the buffer. We're safe if we write the + * page one last time after freeing the journal header. */ if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { spin_unlock(lock); @@ -887,7 +891,7 @@ static int flush_older_commits(struct super_block *s, unsigned int other_trans_id; unsigned int first_trans_id; - find_first: +find_first: /* * first we walk backwards to find the oldest uncommitted transation */ @@ -923,9 +927,11 @@ static int flush_older_commits(struct super_block *s, if (!journal_list_still_alive(s, trans_id)) return 1; - /* the one we just flushed is gone, this means all - * older lists are also gone, so first_jl is no longer - * valid either. Go back to the beginning. + /* + * the one we just flushed is gone, this means + * all older lists are also gone, so first_jl + * is no longer valid either. Go back to the + * beginning. */ if (!journal_list_still_alive (s, other_trans_id)) { @@ -958,12 +964,12 @@ static int reiserfs_async_progress_wait(struct super_block *s) } /* -** if this journal list still has commit blocks unflushed, send them to disk. -** -** log areas must be flushed in order (transaction 2 can't commit before transaction 1) -** Before the commit block can by written, every other log block must be safely on disk -** -*/ + * if this journal list still has commit blocks unflushed, send them to disk. + * + * log areas must be flushed in order (transaction 2 can't commit before + * transaction 1) Before the commit block can by written, every other log + * block must be safely on disk + */ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { @@ -982,8 +988,9 @@ static int flush_commit_list(struct super_block *s, return 0; } - /* before we can put our commit blocks on disk, we have to make sure everyone older than - ** us is on disk too + /* + * before we can put our commit blocks on disk, we have to make + * sure everyone older than us is on disk too */ BUG_ON(jl->j_len <= 0); BUG_ON(trans_id == journal->j_trans_id); @@ -991,7 +998,10 @@ static int flush_commit_list(struct super_block *s, get_journal_list(jl); if (flushall) { if (flush_older_commits(s, jl) == 1) { - /* list disappeared during flush_older_commits. return */ + /* + * list disappeared during flush_older_commits. + * return + */ goto put_jl; } } @@ -1006,9 +1016,9 @@ static int flush_commit_list(struct super_block *s, BUG_ON(jl->j_trans_id == 0); /* this commit is done, exit */ - if (atomic_read(&(jl->j_commit_left)) <= 0) { + if (atomic_read(&jl->j_commit_left) <= 0) { if (flushall) { - atomic_set(&(jl->j_older_commits_done), 1); + atomic_set(&jl->j_older_commits_done, 1); } mutex_unlock(&jl->j_commit_mutex); goto put_jl; @@ -1063,9 +1073,10 @@ static int flush_commit_list(struct super_block *s, depth = reiserfs_write_unlock_nested(s); __wait_on_buffer(tbh); reiserfs_write_lock_nested(s, depth); - // since we're using ll_rw_blk above, it might have skipped over - // a locked buffer. Double check here - // + /* + * since we're using ll_rw_blk above, it might have skipped + * over a locked buffer. Double check here + */ /* redundant, sync_dirty_buffer() checks */ if (buffer_dirty(tbh)) { depth = reiserfs_write_unlock_nested(s); @@ -1079,17 +1090,21 @@ static int flush_commit_list(struct super_block *s, #endif retval = -EIO; } - put_bh(tbh); /* once for journal_find_get_block */ - put_bh(tbh); /* once due to original getblk in do_journal_end */ - atomic_dec(&(jl->j_commit_left)); + /* once for journal_find_get_block */ + put_bh(tbh); + /* once due to original getblk in do_journal_end */ + put_bh(tbh); + atomic_dec(&jl->j_commit_left); } - BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); + BUG_ON(atomic_read(&jl->j_commit_left) != 1); - /* If there was a write error in the journal - we can't commit + /* + * If there was a write error in the journal - we can't commit * this transaction - it will be invalid and, if successful, * will just end up propagating the write error out to - * the file system. */ + * the file system. + */ if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { if (buffer_dirty(jl->j_commit_bh)) BUG(); @@ -1102,9 +1117,11 @@ static int flush_commit_list(struct super_block *s, reiserfs_write_lock_nested(s, depth); } - /* If there was a write error in the journal - we can't commit this + /* + * If there was a write error in the journal - we can't commit this * transaction - it will be invalid and, if successful, will just end - * up propagating the write error out to the filesystem. */ + * up propagating the write error out to the filesystem. + */ if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(s, "journal-615", "buffer write failed"); @@ -1119,7 +1136,10 @@ static int flush_commit_list(struct super_block *s, } journal->j_last_commit_id = jl->j_trans_id; - /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ + /* + * now, every commit block is on the disk. It is safe to allow + * blocks freed during this transaction to be reallocated + */ cleanup_freed_for_journal_list(s, jl); retval = retval ? retval : journal->j_errno; @@ -1127,13 +1147,13 @@ static int flush_commit_list(struct super_block *s, /* mark the metadata dirty */ if (!retval) dirty_one_transaction(s, jl); - atomic_dec(&(jl->j_commit_left)); + atomic_dec(&jl->j_commit_left); if (flushall) { - atomic_set(&(jl->j_older_commits_done), 1); + atomic_set(&jl->j_older_commits_done, 1); } mutex_unlock(&jl->j_commit_mutex); - put_jl: +put_jl: put_journal_list(s, jl); if (retval) @@ -1143,9 +1163,9 @@ static int flush_commit_list(struct super_block *s, } /* -** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or -** returns NULL if it can't find anything -*/ + * flush_journal_list frequently needs to find a newer transaction for a + * given block. This does that, or returns NULL if it can't find anything + */ static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn) @@ -1169,10 +1189,11 @@ static void remove_journal_hash(struct super_block *, int); /* -** once all the real blocks have been flushed, it is safe to remove them from the -** journal list for this transaction. Aside from freeing the cnode, this also allows the -** block to be reallocated for data blocks if it had been deleted. -*/ + * once all the real blocks have been flushed, it is safe to remove them + * from the journal list for this transaction. Aside from freeing the + * cnode, this also allows the block to be reallocated for data blocks + * if it had been deleted. + */ static void remove_all_from_journal_list(struct super_block *sb, struct reiserfs_journal_list *jl, int debug) @@ -1181,8 +1202,9 @@ static void remove_all_from_journal_list(struct super_block *sb, struct reiserfs_journal_cnode *cn, *last; cn = jl->j_realblock; - /* which is better, to lock once around the whole loop, or - ** to lock for each call to remove_journal_hash? + /* + * which is better, to lock once around the whole loop, or + * to lock for each call to remove_journal_hash? */ while (cn) { if (cn->blocknr != 0) { @@ -1204,12 +1226,13 @@ static void remove_all_from_journal_list(struct super_block *sb, } /* -** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block. -** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start -** releasing blocks in this transaction for reuse as data blocks. -** called by flush_journal_list, before it calls remove_all_from_journal_list -** -*/ + * if this timestamp is greater than the timestamp we wrote last to the + * header block, write it to the header block. once this is done, I can + * safely say the log area for this transaction won't ever be replayed, + * and I can start releasing blocks in this transaction for reuse as data + * blocks. called by flush_journal_list, before it calls + * remove_all_from_journal_list + */ static int _update_journal_header_block(struct super_block *sb, unsigned long offset, unsigned int trans_id) @@ -1279,10 +1302,11 @@ static int flush_older_journal_lists(struct super_block *sb, struct reiserfs_journal *journal = SB_JOURNAL(sb); unsigned int trans_id = jl->j_trans_id; - /* we know we are the only ones flushing things, no extra race + /* + * we know we are the only ones flushing things, no extra race * protection is required. */ - restart: +restart: entry = journal->j_journal_list.next; /* Did we wrap? */ if (entry == &journal->j_journal_list) @@ -1309,15 +1333,16 @@ static void del_from_work_list(struct super_block *s, } } -/* flush a journal list, both commit and real blocks -** -** always set flushall to 1, unless you are calling from inside -** flush_journal_list -** -** IMPORTANT. This can only be called while there are no journal writers, -** and the journal is locked. That means it can only be called from -** do_journal_end, or by journal_release -*/ +/* + * flush a journal list, both commit and real blocks + * + * always set flushall to 1, unless you are calling from inside + * flush_journal_list + * + * IMPORTANT. This can only be called while there are no journal writers, + * and the journal is locked. That means it can only be called from + * do_journal_end, or by journal_release + */ static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { @@ -1354,13 +1379,14 @@ static int flush_journal_list(struct super_block *s, } /* if all the work is already done, get out of here */ - if (atomic_read(&(jl->j_nonzerolen)) <= 0 && - atomic_read(&(jl->j_commit_left)) <= 0) { + if (atomic_read(&jl->j_nonzerolen) <= 0 && + atomic_read(&jl->j_commit_left) <= 0) { goto flush_older_and_return; } - /* start by putting the commit list on disk. This will also flush - ** the commit lists of any olders transactions + /* + * start by putting the commit list on disk. This will also flush + * the commit lists of any olders transactions */ flush_commit_list(s, jl, 1); @@ -1369,15 +1395,16 @@ static int flush_journal_list(struct super_block *s, BUG(); /* are we done now? */ - if (atomic_read(&(jl->j_nonzerolen)) <= 0 && - atomic_read(&(jl->j_commit_left)) <= 0) { + if (atomic_read(&jl->j_nonzerolen) <= 0 && + atomic_read(&jl->j_commit_left) <= 0) { goto flush_older_and_return; } - /* loop through each cnode, see if we need to write it, - ** or wait on a more recent transaction, or just ignore it + /* + * loop through each cnode, see if we need to write it, + * or wait on a more recent transaction, or just ignore it */ - if (atomic_read(&(journal->j_wcount)) != 0) { + if (atomic_read(&journal->j_wcount) != 0) { reiserfs_panic(s, "journal-844", "journal list is flushing, " "wcount is not 0"); } @@ -1391,20 +1418,25 @@ static int flush_journal_list(struct super_block *s, goto free_cnode; } - /* This transaction failed commit. Don't write out to the disk */ + /* + * This transaction failed commit. + * Don't write out to the disk + */ if (!(jl->j_state & LIST_DIRTY)) goto free_cnode; pjl = find_newer_jl_for_cn(cn); - /* the order is important here. We check pjl to make sure we - ** don't clear BH_JDirty_wait if we aren't the one writing this - ** block to disk + /* + * the order is important here. We check pjl to make sure we + * don't clear BH_JDirty_wait if we aren't the one writing this + * block to disk */ if (!pjl && cn->bh) { saved_bh = cn->bh; - /* we do this to make sure nobody releases the buffer while - ** we are working with it + /* + * we do this to make sure nobody releases the + * buffer while we are working with it */ get_bh(saved_bh); @@ -1413,13 +1445,17 @@ static int flush_journal_list(struct super_block *s, was_jwait = 1; was_dirty = 1; } else if (can_dirty(cn)) { - /* everything with !pjl && jwait should be writable */ + /* + * everything with !pjl && jwait + * should be writable + */ BUG(); } } - /* if someone has this block in a newer transaction, just make - ** sure they are committed, and don't try writing it to disk + /* + * if someone has this block in a newer transaction, just make + * sure they are committed, and don't try writing it to disk */ if (pjl) { if (atomic_read(&pjl->j_commit_left)) @@ -1427,16 +1463,18 @@ static int flush_journal_list(struct super_block *s, goto free_cnode; } - /* bh == NULL when the block got to disk on its own, OR, - ** the block got freed in a future transaction + /* + * bh == NULL when the block got to disk on its own, OR, + * the block got freed in a future transaction */ if (saved_bh == NULL) { goto free_cnode; } - /* this should never happen. kupdate_one_transaction has this list - ** locked while it works, so we should never see a buffer here that - ** is not marked JDirty_wait + /* + * this should never happen. kupdate_one_transaction has + * this list locked while it works, so we should never see a + * buffer here that is not marked JDirty_wait */ if ((!was_jwait) && !buffer_locked(saved_bh)) { reiserfs_warning(s, "journal-813", @@ -1447,7 +1485,10 @@ static int flush_journal_list(struct super_block *s, was_jwait ? ' ' : '!'); } if (was_dirty) { - /* we inc again because saved_bh gets decremented at free_cnode */ + /* + * we inc again because saved_bh gets decremented + * at free_cnode + */ get_bh(saved_bh); set_bit(BLOCK_NEEDS_FLUSH, &cn->state); lock_buffer(saved_bh); @@ -1463,13 +1504,16 @@ static int flush_journal_list(struct super_block *s, (unsigned long long)saved_bh-> b_blocknr, __func__); } - free_cnode: +free_cnode: last = cn; cn = cn->next; if (saved_bh) { - /* we incremented this to keep others from taking the buffer head away */ + /* + * we incremented this to keep others from + * taking the buffer head away + */ put_bh(saved_bh); - if (atomic_read(&(saved_bh->b_count)) < 0) { + if (atomic_read(&saved_bh->b_count) < 0) { reiserfs_warning(s, "journal-945", "saved_bh->b_count < 0"); } @@ -1499,8 +1543,10 @@ static int flush_journal_list(struct super_block *s, #endif err = -EIO; } - /* note, we must clear the JDirty_wait bit after the up to date - ** check, otherwise we race against our flushpage routine + /* + * note, we must clear the JDirty_wait bit + * after the up to date check, otherwise we + * race against our flushpage routine */ BUG_ON(!test_clear_buffer_journal_dirty (cn->bh)); @@ -1518,25 +1564,27 @@ static int flush_journal_list(struct super_block *s, reiserfs_abort(s, -EIO, "Write error while pushing transaction to disk in %s", __func__); - flush_older_and_return: +flush_older_and_return: - /* before we can update the journal header block, we _must_ flush all - ** real blocks from all older transactions to disk. This is because - ** once the header block is updated, this transaction will not be - ** replayed after a crash + /* + * before we can update the journal header block, we _must_ flush all + * real blocks from all older transactions to disk. This is because + * once the header block is updated, this transaction will not be + * replayed after a crash */ if (flushall) { flush_older_journal_lists(s, jl); } err = journal->j_errno; - /* before we can remove everything from the hash tables for this - ** transaction, we must make sure it can never be replayed - ** - ** since we are only called from do_journal_end, we know for sure there - ** are no allocations going on while we are flushing journal lists. So, - ** we only need to update the journal header block for the last list - ** being flushed + /* + * before we can remove everything from the hash tables for this + * transaction, we must make sure it can never be replayed + * + * since we are only called from do_journal_end, we know for sure there + * are no allocations going on while we are flushing journal lists. So, + * we only need to update the journal header block for the last list + * being flushed */ if (!err && flushall) { err = @@ -1561,11 +1609,12 @@ static int flush_journal_list(struct super_block *s, } journal->j_last_flush_id = jl->j_trans_id; - /* not strictly required since we are freeing the list, but it should + /* + * not strictly required since we are freeing the list, but it should * help find code using dead lists later on */ jl->j_len = 0; - atomic_set(&(jl->j_nonzerolen), 0); + atomic_set(&jl->j_nonzerolen, 0); jl->j_start = 0; jl->j_realblock = NULL; jl->j_commit_bh = NULL; @@ -1592,15 +1641,17 @@ static int write_one_transaction(struct super_block *s, cn = jl->j_realblock; while (cn) { - /* if the blocknr == 0, this has been cleared from the hash, - ** skip it + /* + * if the blocknr == 0, this has been cleared from the hash, + * skip it */ if (cn->blocknr == 0) { goto next; } if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { struct buffer_head *tmp_bh; - /* we can race against journal_mark_freed when we try + /* + * we can race against journal_mark_freed when we try * to lock_buffer(cn->bh), so we have to inc the buffer * count, and recheck things after locking */ @@ -1619,7 +1670,7 @@ static int write_one_transaction(struct super_block *s, } put_bh(tmp_bh); } - next: +next: cn = cn->next; cond_resched(); } @@ -1637,15 +1688,17 @@ static int dirty_one_transaction(struct super_block *s, jl->j_state |= LIST_DIRTY; cn = jl->j_realblock; while (cn) { - /* look for a more recent transaction that logged this - ** buffer. Only the most recent transaction with a buffer in - ** it is allowed to send that buffer to disk + /* + * look for a more recent transaction that logged this + * buffer. Only the most recent transaction with a buffer in + * it is allowed to send that buffer to disk */ pjl = find_newer_jl_for_cn(cn); if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh)) { BUG_ON(!can_dirty(cn)); - /* if the buffer is prepared, it will either be logged + /* + * if the buffer is prepared, it will either be logged * or restored. If restored, we need to make sure * it actually gets marked dirty */ @@ -1682,7 +1735,8 @@ static int kupdate_transactions(struct super_block *s, goto done; } - /* we've got j_flush_mutex held, nobody is going to delete any + /* + * we've got j_flush_mutex held, nobody is going to delete any * of these lists out from underneath us */ while ((num_trans && transactions_flushed < num_trans) || @@ -1716,20 +1770,21 @@ static int kupdate_transactions(struct super_block *s, write_chunk(&chunk); } - done: +done: mutex_unlock(&journal->j_flush_mutex); return ret; } -/* for o_sync and fsync heavy applications, they tend to use -** all the journa list slots with tiny transactions. These -** trigger lots and lots of calls to update the header block, which -** adds seeks and slows things down. -** -** This function tries to clear out a large chunk of the journal lists -** at once, which makes everything faster since only the newest journal -** list updates the header block -*/ +/* + * for o_sync and fsync heavy applications, they tend to use + * all the journa list slots with tiny transactions. These + * trigger lots and lots of calls to update the header block, which + * adds seeks and slows things down. + * + * This function tries to clear out a large chunk of the journal lists + * at once, which makes everything faster since only the newest journal + * list updates the header block + */ static int flush_used_journal_lists(struct super_block *s, struct reiserfs_journal_list *jl) { @@ -1766,9 +1821,11 @@ static int flush_used_journal_lists(struct super_block *s, } get_journal_list(jl); get_journal_list(flush_jl); - /* try to find a group of blocks we can flush across all the - ** transactions, but only bother if we've actually spanned - ** across multiple lists + + /* + * try to find a group of blocks we can flush across all the + * transactions, but only bother if we've actually spanned + * across multiple lists */ if (flush_jl != jl) { ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); @@ -1780,9 +1837,9 @@ static int flush_used_journal_lists(struct super_block *s, } /* -** removes any nodes in table with name block and dev as bh. -** only touchs the hnext and hprev pointers. -*/ + * removes any nodes in table with name block and dev as bh. + * only touchs the hnext and hprev pointers. + */ void remove_journal_hash(struct super_block *sb, struct reiserfs_journal_cnode **table, struct reiserfs_journal_list *jl, @@ -1811,8 +1868,12 @@ void remove_journal_hash(struct super_block *sb, cur->blocknr = 0; cur->sb = NULL; cur->state = 0; - if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */ - atomic_dec(&(cur->jlist->j_nonzerolen)); + /* + * anybody who clears the cur->bh will also + * dec the nonzerolen + */ + if (cur->bh && cur->jlist) + atomic_dec(&cur->jlist->j_nonzerolen); cur->bh = NULL; cur->jlist = NULL; } @@ -1832,17 +1893,18 @@ static void free_journal_ram(struct super_block *sb) if (journal->j_header_bh) { brelse(journal->j_header_bh); } - /* j_header_bh is on the journal dev, make sure not to release the journal - * dev until we brelse j_header_bh + /* + * j_header_bh is on the journal dev, make sure + * not to release the journal dev until we brelse j_header_bh */ release_journal_dev(sb, journal); vfree(journal); } /* -** call on unmount. Only set error to 1 if you haven't made your way out -** of read_super() yet. Any other caller must keep error at 0. -*/ + * call on unmount. Only set error to 1 if you haven't made your way out + * of read_super() yet. Any other caller must keep error at 0. + */ static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb, int error) { @@ -1850,21 +1912,25 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, int flushed = 0; struct reiserfs_journal *journal = SB_JOURNAL(sb); - /* we only want to flush out transactions if we were called with error == 0 + /* + * we only want to flush out transactions if we were + * called with error == 0 */ if (!error && !(sb->s_flags & MS_RDONLY)) { /* end the current trans */ BUG_ON(!th->t_trans_id); - do_journal_end(th, sb, 10, FLUSH_ALL); + do_journal_end(th, FLUSH_ALL); - /* make sure something gets logged to force our way into the flush code */ - if (!journal_join(&myth, sb, 1)) { + /* + * make sure something gets logged to force + * our way into the flush code + */ + if (!journal_join(&myth, sb)) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(&myth, sb, - SB_BUFFER_WITH_SB(sb)); - do_journal_end(&myth, sb, 1, FLUSH_ALL); + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, FLUSH_ALL); flushed = 1; } } @@ -1872,17 +1938,15 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, /* this also catches errors during the do_journal_end above */ if (!error && reiserfs_is_journal_aborted(journal)) { memset(&myth, 0, sizeof(myth)); - if (!journal_join_abort(&myth, sb, 1)) { + if (!journal_join_abort(&myth, sb)) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(&myth, sb, - SB_BUFFER_WITH_SB(sb)); - do_journal_end(&myth, sb, 1, FLUSH_ALL); + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, FLUSH_ALL); } } - reiserfs_mounted_fs_count--; /* wait for all commits to finish */ cancel_delayed_work(&SB_JOURNAL(sb)->j_work); @@ -1893,12 +1957,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, reiserfs_write_unlock(sb); cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); - flush_workqueue(commit_wq); - - if (!reiserfs_mounted_fs_count) { - destroy_workqueue(commit_wq); - commit_wq = NULL; - } + flush_workqueue(REISERFS_SB(sb)->commit_wq); free_journal_ram(sb); @@ -1907,25 +1966,24 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, return 0; } -/* -** call on unmount. flush all journal trans, release all alloc'd ram -*/ +/* * call on unmount. flush all journal trans, release all alloc'd ram */ int journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb) { return do_journal_release(th, sb, 0); } -/* -** only call from an error condition inside reiserfs_read_super! -*/ +/* only call from an error condition inside reiserfs_read_super! */ int journal_release_error(struct reiserfs_transaction_handle *th, struct super_block *sb) { return do_journal_release(th, sb, 1); } -/* compares description block with commit block. returns 1 if they differ, 0 if they are the same */ +/* + * compares description block with commit block. + * returns 1 if they differ, 0 if they are the same + */ static int journal_compare_desc_commit(struct super_block *sb, struct reiserfs_journal_desc *desc, struct reiserfs_journal_commit *commit) @@ -1939,11 +1997,12 @@ static int journal_compare_desc_commit(struct super_block *sb, return 0; } -/* returns 0 if it did not find a description block -** returns -1 if it found a corrupt commit block -** returns 1 if both desc and commit were valid -** NOTE: only called during fs mount -*/ +/* + * returns 0 if it did not find a description block + * returns -1 if it found a corrupt commit block + * returns 1 if both desc and commit were valid + * NOTE: only called during fs mount + */ static int journal_transaction_is_valid(struct super_block *sb, struct buffer_head *d_bh, unsigned int *oldest_invalid_trans_id, @@ -1989,7 +2048,10 @@ static int journal_transaction_is_valid(struct super_block *sb, } offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); - /* ok, we have a journal description block, lets see if the transaction was valid */ + /* + * ok, we have a journal description block, + * let's see if the transaction was valid + */ c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + @@ -2041,11 +2103,11 @@ static void brelse_array(struct buffer_head **heads, int num) } /* -** given the start, and values for the oldest acceptable transactions, -** this either reads in a replays a transaction, or returns because the -** transaction is invalid, or too old. -** NOTE: only called during fs mount -*/ + * given the start, and values for the oldest acceptable transactions, + * this either reads in a replays a transaction, or returns because the + * transaction is invalid, or too old. + * NOTE: only called during fs mount + */ static int journal_read_transaction(struct super_block *sb, unsigned long cur_dblock, unsigned long oldest_start, @@ -2119,7 +2181,10 @@ static int journal_read_transaction(struct super_block *sb, } trans_id = get_desc_trans_id(desc); - /* now we know we've got a good transaction, and it was inside the valid time ranges */ + /* + * now we know we've got a good transaction, and it was + * inside the valid time ranges + */ log_blocks = kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS); real_blocks = kmalloc(get_desc_trans_len(desc) * @@ -2164,7 +2229,7 @@ static int journal_read_transaction(struct super_block *sb, reiserfs_warning(sb, "journal-1204", "REPLAY FAILURE fsck required! " "Trying to replay onto a log block"); - abort_replay: +abort_replay: brelse_array(log_blocks, i); brelse_array(real_blocks, i); brelse(c_bh); @@ -2226,7 +2291,10 @@ static int journal_read_transaction(struct super_block *sb, "journal-1095: setting journal " "start to offset %ld", cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); - /* init starting values for the first transaction, in case this is the last transaction to be replayed. */ + /* + * init starting values for the first transaction, in case + * this is the last transaction to be replayed. + */ journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb); journal->j_last_flush_trans_id = trans_id; journal->j_trans_id = trans_id + 1; @@ -2240,12 +2308,14 @@ static int journal_read_transaction(struct super_block *sb, return 0; } -/* This function reads blocks starting from block and to max_block of bufsize - size (but no more than BUFNR blocks at a time). This proved to improve - mounting speed on self-rebuilding raid5 arrays at least. - Right now it is only used from journal code. But later we might use it - from other places. - Note: Do not use journal_getblk/sb_getblk functions here! */ +/* + * This function reads blocks starting from block and to max_block of bufsize + * size (but no more than BUFNR blocks at a time). This proved to improve + * mounting speed on self-rebuilding raid5 arrays at least. + * Right now it is only used from journal code. But later we might use it + * from other places. + * Note: Do not use journal_getblk/sb_getblk functions here! + */ static struct buffer_head *reiserfs_breada(struct block_device *dev, b_blocknr_t block, int bufsize, b_blocknr_t max_block) @@ -2284,15 +2354,17 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, } /* -** read and replay the log -** on a clean unmount, the journal header's next unflushed pointer will -** be to an invalid transaction. This tests that before finding all the -** transactions in the log, which makes normal mount times fast. -** After a crash, this starts with the next unflushed transaction, and -** replays until it finds one too old, or invalid. -** On exit, it sets things up so the first transaction will work correctly. -** NOTE: only called during fs mount -*/ + * read and replay the log + * on a clean unmount, the journal header's next unflushed pointer will be + * to an invalid transaction. This tests that before finding all the + * transactions in the log, which makes normal mount times fast. + * + * After a crash, this starts with the next unflushed transaction, and + * replays until it finds one too old, or invalid. + * + * On exit, it sets things up so the first transaction will work correctly. + * NOTE: only called during fs mount + */ static int journal_read(struct super_block *sb) { struct reiserfs_journal *journal = SB_JOURNAL(sb); @@ -2316,9 +2388,10 @@ static int journal_read(struct super_block *sb) bdevname(journal->j_dev_bd, b)); start = get_seconds(); - /* step 1, read in the journal header block. Check the transaction it says - ** is the first unflushed, and if that transaction is not valid, - ** replay is done + /* + * step 1, read in the journal header block. Check the transaction + * it says is the first unflushed, and if that transaction is not + * valid, replay is done */ journal->j_header_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) @@ -2342,9 +2415,10 @@ static int journal_read(struct super_block *sb) le32_to_cpu(jh->j_last_flush_trans_id)); valid_journal_header = 1; - /* now, we try to read the first unflushed offset. If it is not valid, - ** there is nothing more we can do, and it makes no sense to read - ** through the whole log. + /* + * now, we try to read the first unflushed offset. If it + * is not valid, there is nothing more we can do, and it + * makes no sense to read through the whole log. */ d_bh = journal_bread(sb, @@ -2358,15 +2432,19 @@ static int journal_read(struct super_block *sb) goto start_log_replay; } - /* ok, there are transactions that need to be replayed. start with the first log block, find - ** all the valid transactions, and pick out the oldest. + /* + * ok, there are transactions that need to be replayed. start + * with the first log block, find all the valid transactions, and + * pick out the oldest. */ while (continue_replay && cur_dblock < (SB_ONDISK_JOURNAL_1st_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb))) { - /* Note that it is required for blocksize of primary fs device and journal - device to be the same */ + /* + * Note that it is required for blocksize of primary fs + * device and journal device to be the same + */ d_bh = reiserfs_breada(journal->j_dev_bd, cur_dblock, sb->s_blocksize, @@ -2413,7 +2491,7 @@ static int journal_read(struct super_block *sb) brelse(d_bh); } - start_log_replay: +start_log_replay: cur_dblock = oldest_start; if (oldest_trans_id) { reiserfs_debug(sb, REISERFS_DEBUG_CODE, @@ -2444,9 +2522,11 @@ static int journal_read(struct super_block *sb) reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1225: No valid " "transactions found"); } - /* j_start does not get set correctly if we don't replay any transactions. - ** if we had a valid journal_header, set j_start to the first unflushed transaction value, - ** copy the trans_id from the header + /* + * j_start does not get set correctly if we don't replay any + * transactions. if we had a valid journal_header, set j_start + * to the first unflushed transaction value, copy the trans_id + * from the header */ if (valid_journal_header && replay_count == 0) { journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset); @@ -2475,8 +2555,9 @@ static int journal_read(struct super_block *sb) _update_journal_header_block(sb, journal->j_start, journal->j_last_flush_trans_id)) { reiserfs_write_unlock(sb); - /* replay failed, caller must call free_journal_ram and abort - ** the mount + /* + * replay failed, caller must call free_journal_ram and abort + * the mount */ return -1; } @@ -2569,7 +2650,7 @@ static int journal_init_dev(struct super_block *super, return 0; } -/** +/* * When creating/tuning a file system user can assign some * journal params within boundaries which depend on the ratio * blocksize/standard_blocksize. @@ -2587,8 +2668,7 @@ static int check_advise_trans_params(struct super_block *sb, struct reiserfs_journal *journal) { if (journal->j_trans_max) { - /* Non-default journal params. - Do sanity check for them. */ + /* Non-default journal params. Do sanity check for them. */ int ratio = 1; if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE) ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize; @@ -2610,10 +2690,12 @@ static int check_advise_trans_params(struct super_block *sb, return 1; } } else { - /* Default journal params. - The file system was created by old version - of mkreiserfs, so some fields contain zeros, - and we need to advise proper values for them */ + /* + * Default journal params. + * The file system was created by old version + * of mkreiserfs, so some fields contain zeros, + * and we need to advise proper values for them + */ if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) { reiserfs_warning(sb, "sh-464", "bad blocksize (%u)", sb->s_blocksize); @@ -2626,9 +2708,7 @@ static int check_advise_trans_params(struct super_block *sb, return 0; } -/* -** must be called once on fs mount. calls journal_read for you -*/ +/* must be called once on fs mount. calls journal_read for you */ int journal_init(struct super_block *sb, const char *j_dev_name, int old_format, unsigned int commit_max_age) { @@ -2667,8 +2747,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name, REISERFS_DISK_OFFSET_IN_BYTES / sb->s_blocksize + 2); - /* Sanity check to see is the standard journal fitting within first bitmap - (actual for small blocksizes) */ + /* + * Sanity check to see is the standard journal fitting + * within first bitmap (actual for small blocksizes) + */ if (!SB_ONDISK_JOURNAL_DEVICE(sb) && (SB_JOURNAL_1st_RESERVED_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) { @@ -2754,20 +2836,20 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_start = 0; journal->j_len = 0; journal->j_len_alloc = 0; - atomic_set(&(journal->j_wcount), 0); - atomic_set(&(journal->j_async_throttle), 0); + atomic_set(&journal->j_wcount, 0); + atomic_set(&journal->j_async_throttle, 0); journal->j_bcount = 0; journal->j_trans_start_time = 0; journal->j_last = NULL; journal->j_first = NULL; - init_waitqueue_head(&(journal->j_join_wait)); + init_waitqueue_head(&journal->j_join_wait); mutex_init(&journal->j_mutex); mutex_init(&journal->j_flush_mutex); journal->j_trans_id = 10; journal->j_mount_id = 10; journal->j_state = 0; - atomic_set(&(journal->j_jlock), 0); + atomic_set(&journal->j_jlock, 0); journal->j_cnode_free_list = allocate_cnodes(num_cnodes); journal->j_cnode_free_orig = journal->j_cnode_free_list; journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; @@ -2807,23 +2889,19 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } - reiserfs_mounted_fs_count++; - if (reiserfs_mounted_fs_count <= 1) - commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); - INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); journal->j_work_sb = sb; return 0; - free_and_return: +free_and_return: free_journal_ram(sb); return 1; } /* -** test for a polite end of the current transaction. Used by file_write, and should -** be used by delete to make sure they don't write more than can fit inside a single -** transaction -*/ + * test for a polite end of the current transaction. Used by file_write, + * and should be used by delete to make sure they don't write more than + * can fit inside a single transaction + */ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { @@ -2835,7 +2913,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, return 0; if (journal->j_must_wait > 0 || (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || - atomic_read(&(journal->j_jlock)) || + atomic_read(&journal->j_jlock) || (now - journal->j_trans_start_time) > journal->j_max_trans_age || journal->j_cnode_free < (journal->j_trans_max * 3)) { return 1; @@ -2846,8 +2924,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, return 0; } -/* this must be called inside a transaction -*/ +/* this must be called inside a transaction */ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); @@ -2857,8 +2934,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) return; } -/* this must be called without a transaction started -*/ +/* this must be called without a transaction started */ void reiserfs_allow_writes(struct super_block *s) { struct reiserfs_journal *journal = SB_JOURNAL(s); @@ -2866,8 +2942,7 @@ void reiserfs_allow_writes(struct super_block *s) wake_up(&journal->j_join_wait); } -/* this must be called without a transaction started -*/ +/* this must be called without a transaction started */ void reiserfs_wait_on_write_block(struct super_block *s) { struct reiserfs_journal *journal = SB_JOURNAL(s); @@ -2929,11 +3004,12 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) } } -/* join == true if you must join an existing transaction. -** join == false if you can deal with waiting for others to finish -** -** this will block until the transaction is joinable. send the number of blocks you -** expect to use in nblocks. +/* + * join == true if you must join an existing transaction. + * join == false if you can deal with waiting for others to finish + * + * this will block until the transaction is joinable. send the number of + * blocks you expect to use in nblocks. */ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks, @@ -2955,7 +3031,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, th->t_refcount = 1; th->t_super = sb; - relock: +relock: lock_journal(sb); if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) { unlock_journal(sb); @@ -2974,9 +3050,11 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, } now = get_seconds(); - /* if there is no room in the journal OR - ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning - ** we don't sleep if there aren't other writers + /* + * if there is no room in the journal OR + * if this transaction is too old, and we weren't called joinable, + * wait for it to finish before beginning we don't sleep if there + * aren't other writers */ if ((!join && journal->j_must_wait > 0) || @@ -2990,7 +3068,8 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { old_trans_id = journal->j_trans_id; - unlock_journal(sb); /* allow others to finish this transaction */ + /* allow others to finish this transaction */ + unlock_journal(sb); if (!join && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch && @@ -3002,8 +3081,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, goto relock; } } - /* don't mess with joining the transaction if all we have to do is - * wait for someone else to do a commit + /* + * don't mess with joining the transaction if all we + * have to do is wait for someone else to do a commit */ if (atomic_read(&journal->j_jlock)) { while (journal->j_trans_id == old_trans_id && @@ -3012,15 +3092,15 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, } goto relock; } - retval = journal_join(&myth, sb, 1); + retval = journal_join(&myth, sb); if (retval) goto out_fail; /* someone might have ended the transaction while we joined */ if (old_trans_id != journal->j_trans_id) { - retval = do_journal_end(&myth, sb, 1, 0); + retval = do_journal_end(&myth, 0); } else { - retval = do_journal_end(&myth, sb, 1, COMMIT_NOW); + retval = do_journal_end(&myth, COMMIT_NOW); } if (retval) @@ -3033,7 +3113,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, if (journal->j_trans_start_time == 0) { journal->j_trans_start_time = get_seconds(); } - atomic_inc(&(journal->j_wcount)); + atomic_inc(&journal->j_wcount); journal->j_len_alloc += nblocks; th->t_blocks_logged = 0; th->t_blocks_allocated = nblocks; @@ -3042,11 +3122,13 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, INIT_LIST_HEAD(&th->t_list); return 0; - out_fail: +out_fail: memset(th, 0, sizeof(*th)); - /* Re-set th->t_super, so we can properly keep track of how many + /* + * Re-set th->t_super, so we can properly keep track of how many * persistent transactions there are. We need to do this so if this - * call is part of a failed restart_transaction, we can free it later */ + * call is part of a failed restart_transaction, we can free it later + */ th->t_super = sb; return retval; } @@ -3059,14 +3141,15 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct int ret; struct reiserfs_transaction_handle *th; - /* if we're nesting into an existing transaction. It will be - ** persistent on its own + /* + * if we're nesting into an existing transaction. It will be + * persistent on its own */ if (reiserfs_transaction_running(s)) { th = current->journal_info; th->t_refcount++; BUG_ON(th->t_refcount < 2); - + return th; } th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); @@ -3087,7 +3170,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) struct super_block *s = th->t_super; int ret = 0; if (th->t_trans_id) - ret = journal_end(th, th->t_super, th->t_blocks_allocated); + ret = journal_end(th); else ret = -EIO; if (th->t_refcount == 0) { @@ -3098,29 +3181,31 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) } static int journal_join(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks) + struct super_block *sb) { struct reiserfs_transaction_handle *cur_th = current->journal_info; - /* this keeps do_journal_end from NULLing out the current->journal_info - ** pointer + /* + * this keeps do_journal_end from NULLing out the + * current->journal_info pointer */ th->t_handle_save = cur_th; BUG_ON(cur_th && cur_th->t_refcount > 1); - return do_journal_begin_r(th, sb, nblocks, JBEGIN_JOIN); + return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN); } int journal_join_abort(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks) + struct super_block *sb) { struct reiserfs_transaction_handle *cur_th = current->journal_info; - /* this keeps do_journal_end from NULLing out the current->journal_info - ** pointer + /* + * this keeps do_journal_end from NULLing out the + * current->journal_info pointer */ th->t_handle_save = cur_th; BUG_ON(cur_th && cur_th->t_refcount > 1); - return do_journal_begin_r(th, sb, nblocks, JBEGIN_ABORT); + return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT); } int journal_begin(struct reiserfs_transaction_handle *th, @@ -3142,9 +3227,10 @@ int journal_begin(struct reiserfs_transaction_handle *th, "journal_info != 0"); return 0; } else { - /* we've ended up with a handle from a different filesystem. - ** save it and restore on journal_end. This should never - ** really happen... + /* + * we've ended up with a handle from a different + * filesystem. save it and restore on journal_end. + * This should never really happen... */ reiserfs_warning(sb, "clm-2100", "nesting info a different FS"); @@ -3157,9 +3243,10 @@ int journal_begin(struct reiserfs_transaction_handle *th, ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG); BUG_ON(current->journal_info != th); - /* I guess this boils down to being the reciprocal of clm-2100 above. - * If do_journal_begin_r fails, we need to put it back, since journal_end - * won't be called to do it. */ + /* + * I guess this boils down to being the reciprocal of clm-2100 above. + * If do_journal_begin_r fails, we need to put it back, since + * journal_end won't be called to do it. */ if (ret) current->journal_info = th->t_handle_save; else @@ -3169,17 +3256,19 @@ int journal_begin(struct reiserfs_transaction_handle *th, } /* -** puts bh into the current transaction. If it was already there, reorders removes the -** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order). -** -** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the -** transaction is committed. -** -** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. -*/ + * puts bh into the current transaction. If it was already there, reorders + * removes the old pointers from the hash, and puts new ones in (to make + * sure replay happen in the right order). + * + * if it was dirty, cleans and files onto the clean list. I can't let it + * be dirty again until the transaction is committed. + * + * if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. + */ int journal_mark_dirty(struct reiserfs_transaction_handle *th, - struct super_block *sb, struct buffer_head *bh) + struct buffer_head *bh) { + struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn = NULL; int count_already_incd = 0; @@ -3201,9 +3290,10 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, return 0; } - /* this must be turned into a panic instead of a warning. We can't allow - ** a dirty or journal_dirty or locked buffer to be logged, as some changes - ** could get to disk too early. NOT GOOD. + /* + * this must be turned into a panic instead of a warning. We can't + * allow a dirty or journal_dirty or locked buffer to be logged, as + * some changes could get to disk too early. NOT GOOD. */ if (!prepared || buffer_dirty(bh)) { reiserfs_warning(sb, "journal-1777", @@ -3216,14 +3306,16 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, buffer_journal_dirty(bh) ? ' ' : '!'); } - if (atomic_read(&(journal->j_wcount)) <= 0) { + if (atomic_read(&journal->j_wcount) <= 0) { reiserfs_warning(sb, "journal-1409", "returning because j_wcount was %d", - atomic_read(&(journal->j_wcount))); + atomic_read(&journal->j_wcount)); return 1; } - /* this error means I've screwed up, and we've overflowed the transaction. - ** Nothing can be done here, except make the FS readonly or panic. + /* + * this error means I've screwed up, and we've overflowed + * the transaction. Nothing can be done here, except make the + * FS readonly or panic. */ if (journal->j_len >= journal->j_trans_max) { reiserfs_panic(th->t_super, "journal-1413", @@ -3280,9 +3372,9 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, return 0; } -int journal_end(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks) +int journal_end(struct reiserfs_transaction_handle *th) { + struct super_block *sb = th->t_super; if (!current->journal_info && th->t_refcount > 1) reiserfs_warning(sb, "REISER-NESTING", "th NULL, refcount %d", th->t_refcount); @@ -3297,8 +3389,9 @@ int journal_end(struct reiserfs_transaction_handle *th, struct reiserfs_transaction_handle *cur_th = current->journal_info; - /* we aren't allowed to close a nested transaction on a different - ** filesystem from the one in the task struct + /* + * we aren't allowed to close a nested transaction on a + * different filesystem from the one in the task struct */ BUG_ON(cur_th->t_super != th->t_super); @@ -3308,17 +3401,18 @@ int journal_end(struct reiserfs_transaction_handle *th, } return 0; } else { - return do_journal_end(th, sb, nblocks, 0); + return do_journal_end(th, 0); } } -/* removes from the current transaction, relsing and descrementing any counters. -** also files the removed buffer directly onto the clean list -** -** called by journal_mark_freed when a block has been deleted -** -** returns 1 if it cleaned and relsed the buffer. 0 otherwise -*/ +/* + * removes from the current transaction, relsing and descrementing any counters. + * also files the removed buffer directly onto the clean list + * + * called by journal_mark_freed when a block has been deleted + * + * returns 1 if it cleaned and relsed the buffer. 0 otherwise + */ static int remove_from_transaction(struct super_block *sb, b_blocknr_t blocknr, int already_cleaned) { @@ -3354,7 +3448,7 @@ static int remove_from_transaction(struct super_block *sb, clear_buffer_dirty(bh); clear_buffer_journal_test(bh); put_bh(bh); - if (atomic_read(&(bh->b_count)) < 0) { + if (atomic_read(&bh->b_count) < 0) { reiserfs_warning(sb, "journal-1752", "b_count < 0"); } @@ -3367,15 +3461,16 @@ static int remove_from_transaction(struct super_block *sb, } /* -** for any cnode in a journal list, it can only be dirtied of all the -** transactions that include it are committed to disk. -** this checks through each transaction, and returns 1 if you are allowed to dirty, -** and 0 if you aren't -** -** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log -** blocks for a given transaction on disk -** -*/ + * for any cnode in a journal list, it can only be dirtied of all the + * transactions that include it are committed to disk. + * this checks through each transaction, and returns 1 if you are allowed + * to dirty, and 0 if you aren't + * + * it is called by dirty_journal_list, which is called after + * flush_commit_list has gotten all the log blocks for a given + * transaction on disk + * + */ static int can_dirty(struct reiserfs_journal_cnode *cn) { struct super_block *sb = cn->sb; @@ -3383,9 +3478,10 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) struct reiserfs_journal_cnode *cur = cn->hprev; int can_dirty = 1; - /* first test hprev. These are all newer than cn, so any node here - ** with the same block number and dev means this node can't be sent - ** to disk right now. + /* + * first test hprev. These are all newer than cn, so any node here + * with the same block number and dev means this node can't be sent + * to disk right now. */ while (cur && can_dirty) { if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && @@ -3394,13 +3490,14 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) } cur = cur->hprev; } - /* then test hnext. These are all older than cn. As long as they - ** are committed to the log, it is safe to write cn to disk + /* + * then test hnext. These are all older than cn. As long as they + * are committed to the log, it is safe to write cn to disk */ cur = cn->hnext; while (cur && can_dirty) { if (cur->jlist && cur->jlist->j_len > 0 && - atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && + atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh && cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { can_dirty = 0; } @@ -3409,12 +3506,13 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) return can_dirty; } -/* syncs the commit blocks, but does not force the real buffers to disk -** will wait until the current transaction is done/committed before returning -*/ -int journal_end_sync(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks) +/* + * syncs the commit blocks, but does not force the real buffers to disk + * will wait until the current transaction is done/committed before returning + */ +int journal_end_sync(struct reiserfs_transaction_handle *th) { + struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); BUG_ON(!th->t_trans_id); @@ -3423,14 +3521,12 @@ int journal_end_sync(struct reiserfs_transaction_handle *th, if (journal->j_len == 0) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb)); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb)); } - return do_journal_end(th, sb, nblocks, COMMIT_NOW | WAIT); + return do_journal_end(th, COMMIT_NOW | WAIT); } -/* -** writeback the pending async commits to disk -*/ +/* writeback the pending async commits to disk */ static void flush_async_commits(struct work_struct *work) { struct reiserfs_journal *journal = @@ -3450,9 +3546,9 @@ static void flush_async_commits(struct work_struct *work) } /* -** flushes any old transactions to disk -** ends the current transaction if it is too old -*/ + * flushes any old transactions to disk + * ends the current transaction if it is too old + */ void reiserfs_flush_old_commits(struct super_block *sb) { time_t now; @@ -3460,48 +3556,53 @@ void reiserfs_flush_old_commits(struct super_block *sb) struct reiserfs_journal *journal = SB_JOURNAL(sb); now = get_seconds(); - /* safety check so we don't flush while we are replaying the log during + /* + * safety check so we don't flush while we are replaying the log during * mount */ if (list_empty(&journal->j_journal_list)) return; - /* check the current transaction. If there are no writers, and it is + /* + * check the current transaction. If there are no writers, and it is * too old, finish it, and force the commit blocks to disk */ if (atomic_read(&journal->j_wcount) <= 0 && journal->j_trans_start_time > 0 && journal->j_len > 0 && (now - journal->j_trans_start_time) > journal->j_max_trans_age) { - if (!journal_join(&th, sb, 1)) { + if (!journal_join(&th, sb)) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(&th, sb, - SB_BUFFER_WITH_SB(sb)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb)); - /* we're only being called from kreiserfsd, it makes no sense to do - ** an async commit so that kreiserfsd can do it later + /* + * we're only being called from kreiserfsd, it makes + * no sense to do an async commit so that kreiserfsd + * can do it later */ - do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT); + do_journal_end(&th, COMMIT_NOW | WAIT); } } } /* -** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit -** -** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all -** the writers are done. By the time it wakes up, the transaction it was called has already ended, so it just -** flushes the commit list and returns 0. -** -** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait. -** -** Note, we can't allow the journal_end to proceed while there are still writers in the log. -*/ -static int check_journal_end(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks, - int flags) + * returns 0 if do_journal_end should return right away, returns 1 if + * do_journal_end should finish the commit + * + * if the current transaction is too old, but still has writers, this will + * wait on j_join_wait until all the writers are done. By the time it + * wakes up, the transaction it was called has already ended, so it just + * flushes the commit list and returns 0. + * + * Won't batch when flush or commit_now is set. Also won't batch when + * others are waiting on j_join_wait. + * + * Note, we can't allow the journal_end to proceed while there are still + * writers in the log. + */ +static int check_journal_end(struct reiserfs_transaction_handle *th, int flags) { time_t now; @@ -3509,6 +3610,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, int commit_now = flags & COMMIT_NOW; int wait_on_commit = flags & WAIT; struct reiserfs_journal_list *jl; + struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); BUG_ON(!th->t_trans_id); @@ -3520,23 +3622,27 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, } journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged); - if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. unmounting might not call begin */ - atomic_dec(&(journal->j_wcount)); - } + /* <= 0 is allowed. unmounting might not call begin */ + if (atomic_read(&journal->j_wcount) > 0) + atomic_dec(&journal->j_wcount); - /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released - ** will be dealt with by next transaction that actually writes something, but should be taken - ** care of in this trans + /* + * BUG, deal with case where j_len is 0, but people previously + * freed blocks need to be released will be dealt with by next + * transaction that actually writes something, but should be taken + * care of in this trans */ BUG_ON(journal->j_len == 0); - /* if wcount > 0, and we are called to with flush or commit_now, - ** we wait on j_join_wait. We will wake up when the last writer has - ** finished the transaction, and started it on its way to the disk. - ** Then, we flush the commit or journal list, and just return 0 - ** because the rest of journal end was already done for this transaction. + /* + * if wcount > 0, and we are called to with flush or commit_now, + * we wait on j_join_wait. We will wake up when the last writer has + * finished the transaction, and started it on its way to the disk. + * Then, we flush the commit or journal list, and just return 0 + * because the rest of journal end was already done for this + * transaction. */ - if (atomic_read(&(journal->j_wcount)) > 0) { + if (atomic_read(&journal->j_wcount) > 0) { if (flush || commit_now) { unsigned trans_id; @@ -3544,27 +3650,30 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, trans_id = jl->j_trans_id; if (wait_on_commit) jl->j_state |= LIST_COMMIT_PENDING; - atomic_set(&(journal->j_jlock), 1); + atomic_set(&journal->j_jlock, 1); if (flush) { journal->j_next_full_flush = 1; } unlock_journal(sb); - /* sleep while the current transaction is still j_jlocked */ + /* + * sleep while the current transaction is + * still j_jlocked + */ while (journal->j_trans_id == trans_id) { if (atomic_read(&journal->j_jlock)) { queue_log_writer(sb); } else { lock_journal(sb); if (journal->j_trans_id == trans_id) { - atomic_set(&(journal->j_jlock), + atomic_set(&journal->j_jlock, 1); } unlock_journal(sb); } } BUG_ON(journal->j_trans_id == trans_id); - + if (commit_now && journal_list_still_alive(sb, trans_id) && wait_on_commit) { @@ -3584,7 +3693,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, } /* don't batch when someone is waiting on j_join_wait */ /* don't batch when syncing the commit or flushing the whole trans */ - if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock))) + if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock)) && !flush && !commit_now && (journal->j_len < journal->j_max_batch) && journal->j_len_alloc < journal->j_max_batch && journal->j_cnode_free > (journal->j_trans_max * 3)) { @@ -3602,19 +3711,22 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, } /* -** Does all the work that makes deleting blocks safe. -** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on. -** -** otherwise: -** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes -** before this transaction has finished. -** -** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. That will prevent any old transactions with -** this block from trying to flush to the real location. Since we aren't removing the cnode from the journal_list_hash, -** the block can't be reallocated yet. -** -** Then remove it from the current transaction, decrementing any counters and filing it on the clean list. -*/ + * Does all the work that makes deleting blocks safe. + * when deleting a block mark BH_JNew, just remove it from the current + * transaction, clean it's buffer_head and move on. + * + * otherwise: + * set a bit for the block in the journal bitmap. That will prevent it from + * being allocated for unformatted nodes before this transaction has finished. + * + * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. + * That will prevent any old transactions with this block from trying to flush + * to the real location. Since we aren't removing the cnode from the + * journal_list_hash, *the block can't be reallocated yet. + * + * Then remove it from the current transaction, decrementing any counters and + * filing it on the clean list. + */ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_block *sb, b_blocknr_t blocknr) { @@ -3637,7 +3749,10 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, reiserfs_clean_and_file_buffer(bh); cleaned = remove_from_transaction(sb, blocknr, cleaned); } else { - /* set the bit for this block in the journal bitmap for this transaction */ + /* + * set the bit for this block in the journal bitmap + * for this transaction + */ jb = journal->j_current_jl->j_list_bitmap; if (!jb) { reiserfs_panic(sb, "journal-1702", @@ -3653,17 +3768,22 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, } cleaned = remove_from_transaction(sb, blocknr, cleaned); - /* find all older transactions with this block, make sure they don't try to write it out */ + /* + * find all older transactions with this block, + * make sure they don't try to write it out + */ cn = get_journal_hash_dev(sb, journal->j_list_hash_table, blocknr); while (cn) { if (sb == cn->sb && blocknr == cn->blocknr) { set_bit(BLOCK_FREED, &cn->state); if (cn->bh) { + /* + * remove_from_transaction will brelse + * the buffer if it was in the current + * trans + */ if (!cleaned) { - /* remove_from_transaction will brelse the buffer if it was - ** in the current trans - */ clear_buffer_journal_dirty(cn-> bh); clear_buffer_dirty(cn->bh); @@ -3672,16 +3792,19 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, cleaned = 1; put_bh(cn->bh); if (atomic_read - (&(cn->bh->b_count)) < 0) { + (&cn->bh->b_count) < 0) { reiserfs_warning(sb, "journal-2138", "cn->bh->b_count < 0"); } } - if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */ - atomic_dec(& - (cn->jlist-> - j_nonzerolen)); + /* + * since we are clearing the bh, + * we MUST dec nonzerolen + */ + if (cn->jlist) { + atomic_dec(&cn->jlist-> + j_nonzerolen); } cn->bh = NULL; } @@ -3714,10 +3837,16 @@ static int __commit_trans_jl(struct inode *inode, unsigned long id, struct reiserfs_journal *journal = SB_JOURNAL(sb); int ret = 0; - /* is it from the current transaction, or from an unknown transaction? */ + /* + * is it from the current transaction, + * or from an unknown transaction? + */ if (id == journal->j_trans_id) { jl = journal->j_current_jl; - /* try to let other writers come in and grow this transaction */ + /* + * try to let other writers come in and + * grow this transaction + */ let_transaction_grow(sb, id); if (journal->j_trans_id != id) { goto flush_commit_only; @@ -3731,21 +3860,22 @@ static int __commit_trans_jl(struct inode *inode, unsigned long id, if (journal->j_trans_id != id) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)); - ret = journal_end(&th, sb, 1); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb)); + ret = journal_end(&th); goto flush_commit_only; } - ret = journal_end_sync(&th, sb, 1); + ret = journal_end_sync(&th); if (!ret) ret = 1; } else { - /* this gets tricky, we have to make sure the journal list in + /* + * this gets tricky, we have to make sure the journal list in * the inode still exists. We know the list is still around * if we've got a larger transaction id than the oldest list */ - flush_commit_only: +flush_commit_only: if (journal_list_still_alive(inode->i_sb, id)) { /* * we only set ret to 1 when we know for sure @@ -3768,7 +3898,8 @@ int reiserfs_commit_for_inode(struct inode *inode) unsigned int id = REISERFS_I(inode)->i_trans_id; struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; - /* for the whole inode, assume unset id means it was + /* + * for the whole inode, assume unset id means it was * changed in the current transaction. More conservative */ if (!id || !jl) { @@ -3806,12 +3937,11 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb, extern struct tree_balance *cur_tb; /* -** before we can change a metadata block, we have to make sure it won't -** be written to disk while we are altering it. So, we must: -** clean it -** wait on it. -** -*/ + * before we can change a metadata block, we have to make sure it won't + * be written to disk while we are altering it. So, we must: + * clean it + * wait on it. + */ int reiserfs_prepare_for_journal(struct super_block *sb, struct buffer_head *bh, int wait) { @@ -3832,19 +3962,18 @@ int reiserfs_prepare_for_journal(struct super_block *sb, } /* -** long and ugly. If flush, will not return until all commit -** blocks and all real buffers in the trans are on disk. -** If no_async, won't return until all commit blocks are on disk. -** -** keep reading, there are comments as you go along -** -** If the journal is aborted, we just clean up. Things like flushing -** journal lists, etc just won't happen. -*/ -static int do_journal_end(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks, - int flags) + * long and ugly. If flush, will not return until all commit + * blocks and all real buffers in the trans are on disk. + * If no_async, won't return until all commit blocks are on disk. + * + * keep reading, there are comments as you go along + * + * If the journal is aborted, we just clean up. Things like flushing + * journal lists, etc just won't happen. + */ +static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) { + struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn, *next, *jl_cn; struct reiserfs_journal_cnode *last_cn = NULL; @@ -3866,9 +3995,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, BUG_ON(th->t_refcount > 1); BUG_ON(!th->t_trans_id); + BUG_ON(!th->t_super); - /* protect flush_older_commits from doing mistakes if the - transaction ID counter gets overflowed. */ + /* + * protect flush_older_commits from doing mistakes if the + * transaction ID counter gets overflowed. + */ if (th->t_trans_id == ~0U) flags |= FLUSH_ALL | COMMIT_NOW | WAIT; flush = flags & FLUSH_ALL; @@ -3879,7 +4011,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, if (journal->j_len == 0) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb)); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb)); } lock_journal(sb); @@ -3892,10 +4024,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, wait_on_commit = 1; } - /* check_journal_end locks the journal, and unlocks if it does not return 1 - ** it tells us if we should continue with the journal_end, or just return + /* + * check_journal_end locks the journal, and unlocks if it does + * not return 1 it tells us if we should continue with the + * journal_end, or just return */ - if (!check_journal_end(th, sb, nblocks, flags)) { + if (!check_journal_end(th, flags)) { reiserfs_schedule_old_flush(sb); wake_queued_writers(sb); reiserfs_async_progress_wait(sb); @@ -3908,19 +4042,23 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, } /* - ** j must wait means we have to flush the log blocks, and the real blocks for - ** this transaction + * j must wait means we have to flush the log blocks, and the + * real blocks for this transaction */ if (journal->j_must_wait > 0) { flush = 1; } #ifdef REISERFS_PREALLOCATE - /* quota ops might need to nest, setup the journal_info pointer for them - * and raise the refcount so that it is > 0. */ + /* + * quota ops might need to nest, setup the journal_info pointer + * for them and raise the refcount so that it is > 0. + */ current->journal_info = th; th->t_refcount++; - reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into - * the transaction */ + + /* it should not involve new blocks into the transaction */ + reiserfs_discard_all_prealloc(th); + th->t_refcount--; current->journal_info = th->t_handle_save; #endif @@ -3936,7 +4074,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8); set_desc_trans_id(desc, journal->j_trans_id); - /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */ + /* + * setup commit block. Don't write (keep it clean too) this one + * until after everyone else is written + */ c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((journal->j_start + journal->j_len + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); @@ -3948,7 +4089,8 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, /* init this journal list */ jl = journal->j_current_jl; - /* we lock the commit before doing anything because + /* + * we lock the commit before doing anything because * we want to make sure nobody tries to run flush_commit_list until * the new transaction is fully setup, and we've already flushed the * ordered bh list @@ -3968,9 +4110,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, atomic_set(&jl->j_commit_left, journal->j_len + 2); jl->j_realblock = NULL; - /* The ENTIRE FOR LOOP MUST not cause schedule to occur. - ** for each real block, add it to the journal list hash, - ** copy into real block index array in the commit or desc block + /* + * The ENTIRE FOR LOOP MUST not cause schedule to occur. + * for each real block, add it to the journal list hash, + * copy into real block index array in the commit or desc block */ trans_half = journal_trans_half(sb->s_blocksize); for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { @@ -3989,9 +4132,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, last_cn->next = jl_cn; } last_cn = jl_cn; - /* make sure the block we are trying to log is not a block - of journal or reserved area */ - + /* + * make sure the block we are trying to log + * is not a block of journal or reserved area + */ if (is_block_in_log_or_reserved_area (sb, cn->bh->b_blocknr)) { reiserfs_panic(sb, "journal-2332", @@ -4021,19 +4165,26 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, set_desc_trans_id(desc, journal->j_trans_id); set_commit_trans_len(commit, journal->j_len); - /* special check in case all buffers in the journal were marked for not logging */ + /* + * special check in case all buffers in the journal + * were marked for not logging + */ BUG_ON(journal->j_len == 0); - /* we're about to dirty all the log blocks, mark the description block + /* + * we're about to dirty all the log blocks, mark the description block * dirty now too. Don't mark the commit block dirty until all the * others are on disk */ mark_buffer_dirty(d_bh); - /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ + /* + * first data block is j_start + 1, so add one to + * cur_write_start wherever you use it + */ cur_write_start = journal->j_start; cn = journal->j_first; - jindex = 1; /* start at one so we don't get the desc again */ + jindex = 1; /* start at one so we don't get the desc again */ while (cn) { clear_buffer_journal_new(cn->bh); /* copy all the real blocks into log area. dirty log blocks */ @@ -4059,7 +4210,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, set_buffer_journal_dirty(cn->bh); clear_buffer_journaled(cn->bh); } else { - /* JDirty cleared sometime during transaction. don't log this one */ + /* + * JDirty cleared sometime during transaction. + * don't log this one + */ reiserfs_warning(sb, "journal-2048", "BAD, buffer in journal hash, " "but not JDirty!"); @@ -4071,9 +4225,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, reiserfs_cond_resched(sb); } - /* we are done with both the c_bh and d_bh, but - ** c_bh must be written after all other commit blocks, - ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. + /* + * we are done with both the c_bh and d_bh, but + * c_bh must be written after all other commit blocks, + * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. */ journal->j_current_jl = alloc_journal_list(sb); @@ -4088,7 +4243,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(sb); - atomic_set(&(journal->j_wcount), 0); + atomic_set(&journal->j_wcount, 0); journal->j_bcount = 0; journal->j_last = NULL; journal->j_first = NULL; @@ -4104,15 +4259,18 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, journal->j_next_async_flush = 0; init_journal_hash(sb); - // make sure reiserfs_add_jh sees the new current_jl before we - // write out the tails + /* + * make sure reiserfs_add_jh sees the new current_jl before we + * write out the tails + */ smp_mb(); - /* tail conversion targets have to hit the disk before we end the + /* + * tail conversion targets have to hit the disk before we end the * transaction. Otherwise a later transaction might repack the tail - * before this transaction commits, leaving the data block unflushed and - * clean, if we crash before the later transaction commits, the data block - * is lost. + * before this transaction commits, leaving the data block unflushed + * and clean, if we crash before the later transaction commits, the + * data block is lost. */ if (!list_empty(&jl->j_tail_bh_list)) { depth = reiserfs_write_unlock_nested(sb); @@ -4123,24 +4281,27 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, BUG_ON(!list_empty(&jl->j_tail_bh_list)); mutex_unlock(&jl->j_commit_mutex); - /* honor the flush wishes from the caller, simple commits can - ** be done outside the journal lock, they are done below - ** - ** if we don't flush the commit list right now, we put it into - ** the work queue so the people waiting on the async progress work - ** queue don't wait for this proc to flush journal lists and such. + /* + * honor the flush wishes from the caller, simple commits can + * be done outside the journal lock, they are done below + * + * if we don't flush the commit list right now, we put it into + * the work queue so the people waiting on the async progress work + * queue don't wait for this proc to flush journal lists and such. */ if (flush) { flush_commit_list(sb, jl, 1); flush_journal_list(sb, jl, 1); } else if (!(jl->j_state & LIST_COMMIT_PENDING)) - queue_delayed_work(commit_wq, &journal->j_work, HZ / 10); + queue_delayed_work(REISERFS_SB(sb)->commit_wq, + &journal->j_work, HZ / 10); - /* if the next transaction has any chance of wrapping, flush - ** transactions that might get overwritten. If any journal lists are very - ** old flush them as well. + /* + * if the next transaction has any chance of wrapping, flush + * transactions that might get overwritten. If any journal lists + * are very old flush them as well. */ - first_jl: +first_jl: list_for_each_safe(entry, safe, &journal->j_journal_list) { temp_jl = JOURNAL_LIST_ENTRY(entry); if (journal->j_start <= temp_jl->j_start) { @@ -4151,8 +4312,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, } else if ((journal->j_start + journal->j_trans_max + 1) < SB_ONDISK_JOURNAL_SIZE(sb)) { - /* if we don't cross into the next transaction and we don't - * wrap, there is no way we can overlap any later transactions + /* + * if we don't cross into the next + * transaction and we don't wrap, there is + * no way we can overlap any later transactions * break now */ break; @@ -4166,10 +4329,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, flush_used_journal_lists(sb, temp_jl); goto first_jl; } else { - /* we don't overlap anything from out start to the end of the - * log, and our wrapped portion doesn't overlap anything at - * the start of the log. We can break - */ + /* + * we don't overlap anything from out start + * to the end of the log, and our wrapped + * portion doesn't overlap anything at + * the start of the log. We can break + */ break; } } @@ -4183,23 +4348,25 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, "could not get a list bitmap"); } - atomic_set(&(journal->j_jlock), 0); + atomic_set(&journal->j_jlock, 0); unlock_journal(sb); /* wake up any body waiting to join. */ clear_bit(J_WRITERS_QUEUED, &journal->j_state); - wake_up(&(journal->j_join_wait)); + wake_up(&journal->j_join_wait); if (!flush && wait_on_commit && journal_list_still_alive(sb, commit_trans_id)) { flush_commit_list(sb, jl, 1); } - out: +out: reiserfs_check_lock_depth(sb, "journal end2"); memset(th, 0, sizeof(*th)); - /* Re-set th->t_super, so we can properly keep track of how many + /* + * Re-set th->t_super, so we can properly keep track of how many * persistent transactions there are. We need to do this so if this - * call is part of a failed restart_transaction, we can free it later */ + * call is part of a failed restart_transaction, we can free it later + */ th->t_super = sb; return journal->j_errno; diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 79e5a8b4c22..d6744c8b24e 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -8,46 +8,42 @@ #include "reiserfs.h" #include <linux/buffer_head.h> -/* these are used in do_balance.c */ - -/* leaf_move_items - leaf_shift_left - leaf_shift_right - leaf_delete_items - leaf_insert_into_buf - leaf_paste_in_buffer - leaf_cut_from_buffer - leaf_paste_entries - */ - -/* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */ +/* + * copy copy_count entries from source directory item to dest buffer + * (creating new item if needed) + */ static void leaf_copy_dir_entries(struct buffer_info *dest_bi, struct buffer_head *source, int last_first, int item_num, int from, int copy_count) { struct buffer_head *dest = dest_bi->bi_bh; - int item_num_in_dest; /* either the number of target item, - or if we must create a new item, - the number of the item we will - create it next to */ + /* + * either the number of target item, or if we must create a + * new item, the number of the item we will create it next to + */ + int item_num_in_dest; + struct item_head *ih; struct reiserfs_de_head *deh; int copy_records_len; /* length of all records in item to be copied */ char *records; - ih = B_N_PITEM_HEAD(source, item_num); + ih = item_head(source, item_num); RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item"); - /* length of all record to be copied and first byte of the last of them */ + /* + * length of all record to be copied and first byte of + * the last of them + */ deh = B_I_DEH(source, ih); if (copy_count) { - copy_records_len = (from ? deh_location(&(deh[from - 1])) : + copy_records_len = (from ? deh_location(&deh[from - 1]) : ih_item_len(ih)) - - deh_location(&(deh[from + copy_count - 1])); + deh_location(&deh[from + copy_count - 1]); records = source->b_data + ih_location(ih) + - deh_location(&(deh[from + copy_count - 1])); + deh_location(&deh[from + copy_count - 1]); } else { copy_records_len = 0; records = NULL; @@ -59,12 +55,15 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi, LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest) - 1); - /* if there are no items in dest or the first/last item in dest is not item of the same directory */ + /* + * if there are no items in dest or the first/last item in + * dest is not item of the same directory + */ if ((item_num_in_dest == -1) || (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) || (last_first == LAST_TO_FIRST && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key, - B_N_PKEY(dest, + leaf_key(dest, item_num_in_dest)))) { /* create new item in dest */ @@ -80,16 +79,22 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi, if (last_first == LAST_TO_FIRST) { /* form key by the following way */ - if (from < I_ENTRY_COUNT(ih)) { + if (from < ih_entry_count(ih)) { set_le_ih_k_offset(&new_ih, - deh_offset(&(deh[from]))); - /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE); */ + deh_offset(&deh[from])); } else { - /* no entries will be copied to this item in this function */ + /* + * no entries will be copied to this + * item in this function + */ set_le_ih_k_offset(&new_ih, U32_MAX); - /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */ + /* + * this item is not yet valid, but we + * want I_IS_DIRECTORY_ITEM to return 1 + * for it, so we -1 + */ } - set_le_key_k_type(KEY_FORMAT_3_5, &(new_ih.ih_key), + set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key, TYPE_DIRENTRY); } @@ -113,36 +118,44 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi, leaf_paste_entries(dest_bi, item_num_in_dest, (last_first == - FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD(dest, + FIRST_TO_LAST) ? ih_entry_count(item_head(dest, item_num_in_dest)) : 0, copy_count, deh + from, records, DEH_SIZE * copy_count + copy_records_len); } -/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or - part of it or nothing (see the return 0 below) from SOURCE to the end - (if last_first) or beginning (!last_first) of the DEST */ +/* + * Copy the first (if last_first == FIRST_TO_LAST) or last + * (last_first == LAST_TO_FIRST) item or part of it or nothing + * (see the return 0 below) from SOURCE to the end (if last_first) + * or beginning (!last_first) of the DEST + */ /* returns 1 if anything was copied, else 0 */ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int bytes_or_entries) { struct buffer_head *dest = dest_bi->bi_bh; - int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */ + /* number of items in the source and destination buffers */ + int dest_nr_item, src_nr_item; struct item_head *ih; struct item_head *dih; dest_nr_item = B_NR_ITEMS(dest); + /* + * if ( DEST is empty or first item of SOURCE and last item of + * DEST are the items of different objects or of different types ) + * then there is no need to treat this item differently from the + * other items that we copy, so we return + */ if (last_first == FIRST_TO_LAST) { - /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects - or of different types ) then there is no need to treat this item differently from the other items - that we copy, so we return */ - ih = B_N_PITEM_HEAD(src, 0); - dih = B_N_PITEM_HEAD(dest, dest_nr_item - 1); + ih = item_head(src, 0); + dih = item_head(dest, dest_nr_item - 1); + + /* there is nothing to merge */ if (!dest_nr_item - || (!op_is_left_mergeable(&(ih->ih_key), src->b_size))) - /* there is nothing to merge */ + || (!op_is_left_mergeable(&ih->ih_key, src->b_size))) return 0; RFALSE(!ih_item_len(ih), @@ -157,8 +170,11 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, return 1; } - /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST - part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header + /* + * copy part of the body of the first item of SOURCE + * to the end of the body of the last item of the DEST + * part defined by 'bytes_or_entries'; if bytes_or_entries + * == -1 copy whole body; don't create new item header */ if (bytes_or_entries == -1) bytes_or_entries = ih_item_len(ih); @@ -176,11 +192,13 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, } #endif - /* merge first item (or its part) of src buffer with the last - item of dest buffer. Both are of the same file */ + /* + * merge first item (or its part) of src buffer with the last + * item of dest buffer. Both are of the same file + */ leaf_paste_in_buffer(dest_bi, dest_nr_item - 1, ih_item_len(dih), - bytes_or_entries, B_I_PITEM(src, ih), 0); + bytes_or_entries, ih_item_body(src, ih), 0); if (is_indirect_le_ih(dih)) { RFALSE(get_ih_free_space(dih), @@ -195,19 +213,23 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, /* copy boundary item to right (last_first == LAST_TO_FIRST) */ - /* ( DEST is empty or last item of SOURCE and first item of DEST - are the items of different object or of different types ) + /* + * (DEST is empty or last item of SOURCE and first item of DEST + * are the items of different object or of different types) */ src_nr_item = B_NR_ITEMS(src); - ih = B_N_PITEM_HEAD(src, src_nr_item - 1); - dih = B_N_PITEM_HEAD(dest, 0); + ih = item_head(src, src_nr_item - 1); + dih = item_head(dest, 0); - if (!dest_nr_item || !op_is_left_mergeable(&(dih->ih_key), src->b_size)) + if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size)) return 0; if (is_direntry_le_ih(ih)) { + /* + * bytes_or_entries = entries number in last + * item body of SOURCE + */ if (bytes_or_entries == -1) - /* bytes_or_entries = entries number in last item body of SOURCE */ bytes_or_entries = ih_entry_count(ih); leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, @@ -217,9 +239,11 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, return 1; } - /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST; - part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST; - don't create new item header + /* + * copy part of the body of the last item of SOURCE to the + * begin of the body of the first item of the DEST; part defined + * by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; + * change first item key of the DEST; don't create new item header */ RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih), @@ -270,15 +294,18 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, } leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries, - B_I_PITEM(src, + ih_item_body(src, ih) + ih_item_len(ih) - bytes_or_entries, 0); return 1; } -/* copy cpy_mun items from buffer src to buffer dest - * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning from first-th item in src to tail of dest - * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning from first-th item in src to head of dest +/* + * copy cpy_mun items from buffer src to buffer dest + * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning + * from first-th item in src to tail of dest + * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning + * from first-th item in src to head of dest */ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, @@ -311,11 +338,14 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, nr = blkh_nr_item(blkh); free_space = blkh_free_space(blkh); - /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */ + /* + * we will insert items before 0-th or nr-th item in dest buffer. + * It depends of last_first parameter + */ dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr; /* location of head of first new item */ - ih = B_N_PITEM_HEAD(dest, dest_before); + ih = item_head(dest, dest_before); RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE, "vs-10140: not enough free space for headers %d (needed %d)", @@ -325,7 +355,7 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE); /* copy item headers */ - memcpy(ih, B_N_PITEM_HEAD(src, first), cpy_num * IH_SIZE); + memcpy(ih, item_head(src, first), cpy_num * IH_SIZE); free_space -= (IH_SIZE * cpy_num); set_blkh_free_space(blkh, free_space); @@ -338,8 +368,8 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, } /* prepare space for items */ - last_loc = ih_location(&(ih[nr + cpy_num - 1 - dest_before])); - last_inserted_loc = ih_location(&(ih[cpy_num - 1])); + last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]); + last_inserted_loc = ih_location(&ih[cpy_num - 1]); /* check free space */ RFALSE(free_space < j - last_inserted_loc, @@ -352,7 +382,8 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, /* copy items */ memcpy(dest->b_data + last_inserted_loc, - B_N_PITEM(src, (first + cpy_num - 1)), j - last_inserted_loc); + item_body(src, (first + cpy_num - 1)), + j - last_inserted_loc); /* sizes, item number */ set_blkh_nr_item(blkh, nr + cpy_num); @@ -376,8 +407,10 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, } } -/* This function splits the (liquid) item into two items (useful when - shifting part of an item into another node.) */ +/* + * This function splits the (liquid) item into two items (useful when + * shifting part of an item into another node.) + */ static void leaf_item_bottle(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int item_num, int cpy_bytes) @@ -389,17 +422,22 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, "vs-10170: bytes == - 1 means: do not split item"); if (last_first == FIRST_TO_LAST) { - /* if ( if item in position item_num in buffer SOURCE is directory item ) */ - ih = B_N_PITEM_HEAD(src, item_num); + /* + * if ( if item in position item_num in buffer SOURCE + * is directory item ) + */ + ih = item_head(src, item_num); if (is_direntry_le_ih(ih)) leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes); else { struct item_head n_ih; - /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST - part defined by 'cpy_bytes'; create new item header; change old item_header (????); - n_ih = new item_header; + /* + * copy part of the body of the item number 'item_num' + * of SOURCE to the end of the DEST part defined by + * 'cpy_bytes'; create new item header; change old + * item_header (????); n_ih = new item_header; */ memcpy(&n_ih, ih, IH_SIZE); put_ih_item_len(&n_ih, cpy_bytes); @@ -411,30 +449,36 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, set_ih_free_space(&n_ih, 0); } - RFALSE(op_is_left_mergeable(&(ih->ih_key), src->b_size), + RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size), "vs-10190: bad mergeability of item %h", ih); n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih, - B_N_PITEM(src, item_num), 0); + item_body(src, item_num), 0); } } else { - /* if ( if item in position item_num in buffer SOURCE is directory item ) */ - ih = B_N_PITEM_HEAD(src, item_num); + /* + * if ( if item in position item_num in buffer + * SOURCE is directory item ) + */ + ih = item_head(src, item_num); if (is_direntry_le_ih(ih)) leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, item_num, - I_ENTRY_COUNT(ih) - cpy_bytes, + ih_entry_count(ih) - cpy_bytes, cpy_bytes); else { struct item_head n_ih; - /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST - part defined by 'cpy_bytes'; create new item header; - n_ih = new item_header; + /* + * copy part of the body of the item number 'item_num' + * of SOURCE to the begin of the DEST part defined by + * 'cpy_bytes'; create new item header; + * n_ih = new item_header; */ memcpy(&n_ih, ih, SHORT_KEY_SIZE); - n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + /* Endian safe, both le */ + n_ih.ih_version = ih->ih_version; if (is_direct_le_ih(ih)) { set_le_ih_k_offset(&n_ih, @@ -458,20 +502,22 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, /* set item length */ put_ih_item_len(&n_ih, cpy_bytes); - n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + /* Endian safe, both le */ + n_ih.ih_version = ih->ih_version; leaf_insert_into_buf(dest_bi, 0, &n_ih, - B_N_PITEM(src, - item_num) + - ih_item_len(ih) - cpy_bytes, 0); + item_body(src, item_num) + + ih_item_len(ih) - cpy_bytes, 0); } } } -/* If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE to DEST. - If cpy_bytes not equal to minus one than copy cpy_num-1 whole items from SOURCE to DEST. - From last item copy cpy_num bytes for regular item and cpy_num directory entries for - directory item. */ +/* + * If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE + * to DEST. If cpy_bytes not equal to minus one than copy cpy_num-1 whole + * items from SOURCE to DEST. From last item copy cpy_num bytes for regular + * item and cpy_num directory entries for directory item. + */ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int cpy_num, int cpy_bytes) { @@ -498,22 +544,34 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, else bytes = -1; - /* copy the first item or it part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */ + /* + * copy the first item or it part or nothing to the end of + * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) + */ i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes); cpy_num -= i; if (cpy_num == 0) return i; pos += i; if (cpy_bytes == -1) - /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */ + /* + * copy first cpy_num items starting from position + * 'pos' of SOURCE to end of DEST + */ leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, pos, cpy_num); else { - /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */ + /* + * copy first cpy_num-1 items starting from position + * 'pos-1' of the SOURCE to the end of the DEST + */ leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, pos, cpy_num - 1); - /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */ + /* + * copy part of the item which number is + * cpy_num+pos-1 to the end of the DEST + */ leaf_item_bottle(dest_bi, src, FIRST_TO_LAST, cpy_num + pos - 1, cpy_bytes); } @@ -525,7 +583,11 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, else bytes = -1; - /* copy the last item or it part or nothing to the begin of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */ + /* + * copy the last item or it part or nothing to the + * begin of the DEST + * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); + */ i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes); cpy_num -= i; @@ -534,15 +596,24 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, pos = src_nr_item - cpy_num - i; if (cpy_bytes == -1) { - /* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */ + /* + * starting from position 'pos' copy last cpy_num + * items of SOURCE to begin of DEST + */ leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, pos, cpy_num); } else { - /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */ + /* + * copy last cpy_num-1 items starting from position + * 'pos+1' of the SOURCE to the begin of the DEST; + */ leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, pos + 1, cpy_num - 1); - /* copy part of the item which number is pos to the begin of the DEST */ + /* + * copy part of the item which number is pos to + * the begin of the DEST + */ leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos, cpy_bytes); } @@ -550,9 +621,11 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, return i; } -/* there are types of coping: from S[0] to L[0], from S[0] to R[0], - from R[0] to L[0]. for each of these we have to define parent and - positions of destination and source buffers */ +/* + * there are types of coping: from S[0] to L[0], from S[0] to R[0], + * from R[0] to L[0]. for each of these we have to define parent and + * positions of destination and source buffers + */ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, struct buffer_info *dest_bi, struct buffer_info *src_bi, @@ -568,7 +641,9 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, src_bi->tb = tb; src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); - src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); /* src->b_item_order */ + + /* src->b_item_order */ + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); dest_bi->tb = tb; dest_bi->bi_bh = tb->L[0]; dest_bi->bi_parent = tb->FL[0]; @@ -633,8 +708,10 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, shift_mode, src_bi->bi_bh, dest_bi->bi_bh); } -/* copy mov_num items and mov_bytes of the (mov_num-1)th item to - neighbor. Delete them from source */ +/* + * copy mov_num items and mov_bytes of the (mov_num-1)th item to + * neighbor. Delete them from source + */ int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, int mov_bytes, struct buffer_head *Snew) { @@ -657,18 +734,24 @@ int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, return ret_value; } -/* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1) - from S[0] to L[0] and replace the delimiting key */ +/* + * Shift shift_num items (and shift_bytes of last shifted item if + * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key + */ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) { struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path); int i; - /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */ + /* + * move shift_num (and shift_bytes bytes) items from S[0] + * to left neighbor L[0] + */ i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); if (shift_num) { - if (B_NR_ITEMS(S0) == 0) { /* number of items in S[0] == 0 */ + /* number of items in S[0] == 0 */ + if (B_NR_ITEMS(S0) == 0) { RFALSE(shift_bytes != -1, "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", @@ -691,10 +774,10 @@ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0); RFALSE((shift_bytes != -1 && - !(is_direntry_le_ih(B_N_PITEM_HEAD(S0, 0)) - && !I_ENTRY_COUNT(B_N_PITEM_HEAD(S0, 0)))) && + !(is_direntry_le_ih(item_head(S0, 0)) + && !ih_entry_count(item_head(S0, 0)))) && (!op_is_left_mergeable - (B_N_PKEY(S0, 0), S0->b_size)), + (leaf_key(S0, 0), S0->b_size)), "vs-10280: item must be mergeable"); } } @@ -704,13 +787,18 @@ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) /* CLEANING STOPPED HERE */ -/* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */ +/* + * Shift shift_num (shift_bytes) items from S[0] to the right neighbor, + * and replace the delimiting key + */ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) { - // struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); int ret_value; - /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */ + /* + * move shift_num (and shift_bytes) items from S[0] to + * right neighbor R[0] + */ ret_value = leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); @@ -725,12 +813,16 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) static void leaf_delete_items_entirely(struct buffer_info *bi, int first, int del_num); -/* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR. - If not. - If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of - the first item. Part defined by del_bytes. Don't delete first item header - If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of - the last item . Part defined by del_bytes. Don't delete last item header. +/* + * If del_bytes == -1, starting from position 'first' delete del_num + * items in whole in buffer CUR. + * If not. + * If last_first == 0. Starting from position 'first' delete del_num-1 + * items in whole. Delete part of body of the first item. Part defined by + * del_bytes. Don't delete first item header + * If last_first == 1. Starting from position 'first+1' delete del_num-1 + * items in whole. Delete part of body of the last item . Part defined by + * del_bytes. Don't delete last item header. */ void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first, int del_num, int del_bytes) @@ -761,32 +853,43 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first, leaf_delete_items_entirely(cur_bi, first, del_num); else { if (last_first == FIRST_TO_LAST) { - /* delete del_num-1 items beginning from item in position first */ + /* + * delete del_num-1 items beginning from + * item in position first + */ leaf_delete_items_entirely(cur_bi, first, del_num - 1); - /* delete the part of the first item of the bh - do not delete item header + /* + * delete the part of the first item of the bh + * do not delete item header */ leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes); } else { struct item_head *ih; int len; - /* delete del_num-1 items beginning from item in position first+1 */ + /* + * delete del_num-1 items beginning from + * item in position first+1 + */ leaf_delete_items_entirely(cur_bi, first + 1, del_num - 1); - ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1); + ih = item_head(bh, B_NR_ITEMS(bh) - 1); if (is_direntry_le_ih(ih)) /* the last item is directory */ - /* len = numbers of directory entries in this item */ + /* + * len = numbers of directory entries + * in this item + */ len = ih_entry_count(ih); else /* len = body len of item */ len = ih_item_len(ih); - /* delete the part of the last item of the bh - do not delete item header + /* + * delete the part of the last item of the bh + * do not delete item header */ leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1, len - del_bytes, del_bytes); @@ -820,10 +923,10 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before, zeros_number, ih_item_len(inserted_item_ih)); /* get item new item must be inserted before */ - ih = B_N_PITEM_HEAD(bh, before); + ih = item_head(bh, before); /* prepare space for the body of new item */ - last_loc = nr ? ih_location(&(ih[nr - before - 1])) : bh->b_size; + last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size; unmoved_loc = before ? ih_location(ih - 1) : bh->b_size; memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih), @@ -846,8 +949,8 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before, /* change locations */ for (i = before; i < nr + 1; i++) { - unmoved_loc -= ih_item_len(&(ih[i - before])); - put_ih_location(&(ih[i - before]), unmoved_loc); + unmoved_loc -= ih_item_len(&ih[i - before]); + put_ih_location(&ih[i - before], unmoved_loc); } /* sizes, free space, item number */ @@ -867,8 +970,10 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before, } } -/* paste paste_size bytes to affected_item_num-th item. - When item is a directory, this only prepare space for new entries */ +/* + * paste paste_size bytes to affected_item_num-th item. + * When item is a directory, this only prepare space for new entries + */ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, int pos_in_item, int paste_size, const char *body, int zeros_number) @@ -902,9 +1007,9 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, #endif /* CONFIG_REISERFS_CHECK */ /* item to be appended */ - ih = B_N_PITEM_HEAD(bh, affected_item_num); + ih = item_head(bh, affected_item_num); - last_loc = ih_location(&(ih[nr - affected_item_num - 1])); + last_loc = ih_location(&ih[nr - affected_item_num - 1]); unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size; /* prepare space */ @@ -913,8 +1018,8 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, /* change locations */ for (i = affected_item_num; i < nr; i++) - put_ih_location(&(ih[i - affected_item_num]), - ih_location(&(ih[i - affected_item_num])) - + put_ih_location(&ih[i - affected_item_num], + ih_location(&ih[i - affected_item_num]) - paste_size); if (body) { @@ -957,10 +1062,12 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, } } -/* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item - does not have free space, so it moves DEHs and remaining records as - necessary. Return value is size of removed part of directory item - in bytes. */ +/* + * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item + * does not have free space, so it moves DEHs and remaining records as + * necessary. Return value is size of removed part of directory item + * in bytes. + */ static int leaf_cut_entries(struct buffer_head *bh, struct item_head *ih, int from, int del_count) { @@ -971,12 +1078,14 @@ static int leaf_cut_entries(struct buffer_head *bh, int cut_records_len; /* length of all removed records */ int i; - /* make sure, that item is directory and there are enough entries to - remove */ + /* + * make sure that item is directory and there are enough entries to + * remove + */ RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item"); - RFALSE(I_ENTRY_COUNT(ih) < from + del_count, + RFALSE(ih_entry_count(ih) < from + del_count, "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d", - I_ENTRY_COUNT(ih), from, del_count); + ih_entry_count(ih), from, del_count); if (del_count == 0) return 0; @@ -987,22 +1096,24 @@ static int leaf_cut_entries(struct buffer_head *bh, /* entry head array */ deh = B_I_DEH(bh, ih); - /* first byte of remaining entries, those are BEFORE cut entries - (prev_record) and length of all removed records (cut_records_len) */ + /* + * first byte of remaining entries, those are BEFORE cut entries + * (prev_record) and length of all removed records (cut_records_len) + */ prev_record_offset = - (from ? deh_location(&(deh[from - 1])) : ih_item_len(ih)); + (from ? deh_location(&deh[from - 1]) : ih_item_len(ih)); cut_records_len = prev_record_offset /*from_record */ - - deh_location(&(deh[from + del_count - 1])); + deh_location(&deh[from + del_count - 1]); prev_record = item + prev_record_offset; /* adjust locations of remaining entries */ - for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i--) - put_deh_location(&(deh[i]), + for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--) + put_deh_location(&deh[i], deh_location(&deh[i]) - (DEH_SIZE * del_count)); for (i = 0; i < from; i++) - put_deh_location(&(deh[i]), + put_deh_location(&deh[i], deh_location(&deh[i]) - (DEH_SIZE * del_count + cut_records_len)); @@ -1021,14 +1132,15 @@ static int leaf_cut_entries(struct buffer_head *bh, return DEH_SIZE * del_count + cut_records_len; } -/* when cut item is part of regular file - pos_in_item - first byte that must be cut - cut_size - number of bytes to be cut beginning from pos_in_item - - when cut item is part of directory - pos_in_item - number of first deleted entry - cut_size - count of deleted entries - */ +/* + * when cut item is part of regular file + * pos_in_item - first byte that must be cut + * cut_size - number of bytes to be cut beginning from pos_in_item + * + * when cut item is part of directory + * pos_in_item - number of first deleted entry + * cut_size - count of deleted entries + */ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, int pos_in_item, int cut_size) { @@ -1043,7 +1155,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, nr = blkh_nr_item(blkh); /* item head of truncated item */ - ih = B_N_PITEM_HEAD(bh, cut_item_num); + ih = item_head(bh, cut_item_num); if (is_direntry_le_ih(ih)) { /* first cut entry () */ @@ -1055,7 +1167,6 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, cut_item_num); /* change item key by key of first entry in the item */ set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih))); - /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE); */ } } else { /* item is direct or indirect */ @@ -1089,7 +1200,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, } /* location of the last item */ - last_loc = ih_location(&(ih[nr - cut_item_num - 1])); + last_loc = ih_location(&ih[nr - cut_item_num - 1]); /* location of the item, which is remaining at the same place */ unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size; @@ -1108,7 +1219,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, /* change locations */ for (i = cut_item_num; i < nr; i++) - put_ih_location(&(ih[i - cut_item_num]), + put_ih_location(&ih[i - cut_item_num], ih_location(&ih[i - cut_item_num]) + cut_size); /* size, free space */ @@ -1156,14 +1267,14 @@ static void leaf_delete_items_entirely(struct buffer_info *bi, return; } - ih = B_N_PITEM_HEAD(bh, first); + ih = item_head(bh, first); /* location of unmovable item */ j = (first == 0) ? bh->b_size : ih_location(ih - 1); /* delete items */ - last_loc = ih_location(&(ih[nr - 1 - first])); - last_removed_loc = ih_location(&(ih[del_num - 1])); + last_loc = ih_location(&ih[nr - 1 - first]); + last_removed_loc = ih_location(&ih[del_num - 1]); memmove(bh->b_data + last_loc + j - last_removed_loc, bh->b_data + last_loc, last_removed_loc - last_loc); @@ -1173,8 +1284,8 @@ static void leaf_delete_items_entirely(struct buffer_info *bi, /* change item location */ for (i = first; i < nr - del_num; i++) - put_ih_location(&(ih[i - first]), - ih_location(&(ih[i - first])) + (j - + put_ih_location(&ih[i - first], + ih_location(&ih[i - first]) + (j - last_removed_loc)); /* sizes, item number */ @@ -1195,7 +1306,10 @@ static void leaf_delete_items_entirely(struct buffer_info *bi, } } -/* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */ +/* + * paste new_entry_count entries (new_dehs, records) into position + * before to item_num-th item + */ void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, @@ -1213,13 +1327,16 @@ void leaf_paste_entries(struct buffer_info *bi, if (new_entry_count == 0) return; - ih = B_N_PITEM_HEAD(bh, item_num); + ih = item_head(bh, item_num); - /* make sure, that item is directory, and there are enough records in it */ + /* + * make sure, that item is directory, and there are enough + * records in it + */ RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item"); - RFALSE(I_ENTRY_COUNT(ih) < before, + RFALSE(ih_entry_count(ih) < before, "10230: there are no entry we paste entries before. entry_count = %d, before = %d", - I_ENTRY_COUNT(ih), before); + ih_entry_count(ih), before); /* first byte of dest item */ item = bh->b_data + ih_location(ih); @@ -1230,21 +1347,21 @@ void leaf_paste_entries(struct buffer_info *bi, /* new records will be pasted at this point */ insert_point = item + - (before ? deh_location(&(deh[before - 1])) + (before ? deh_location(&deh[before - 1]) : (ih_item_len(ih) - paste_size)); /* adjust locations of records that will be AFTER new records */ - for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i--) - put_deh_location(&(deh[i]), - deh_location(&(deh[i])) + + for (i = ih_entry_count(ih) - 1; i >= before; i--) + put_deh_location(&deh[i], + deh_location(&deh[i]) + (DEH_SIZE * new_entry_count)); /* adjust locations of records that will be BEFORE new records */ for (i = 0; i < before; i++) - put_deh_location(&(deh[i]), - deh_location(&(deh[i])) + paste_size); + put_deh_location(&deh[i], + deh_location(&deh[i]) + paste_size); - old_entry_num = I_ENTRY_COUNT(ih); + old_entry_num = ih_entry_count(ih); put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); /* prepare space for pasted records */ @@ -1266,10 +1383,10 @@ void leaf_paste_entries(struct buffer_info *bi, /* set locations of new records */ for (i = 0; i < new_entry_count; i++) { - put_deh_location(&(deh[i]), - deh_location(&(deh[i])) + + put_deh_location(&deh[i], + deh_location(&deh[i]) + (-deh_location - (&(new_dehs[new_entry_count - 1])) + + (&new_dehs[new_entry_count - 1]) + insert_point + DEH_SIZE * new_entry_count - item)); } @@ -1277,28 +1394,26 @@ void leaf_paste_entries(struct buffer_info *bi, /* change item key if necessary (when we paste before 0-th entry */ if (!before) { set_le_ih_k_offset(ih, deh_offset(new_dehs)); -/* memcpy (&ih->ih_key.k_offset, - &new_dehs->deh_offset, SHORT_KEY_SIZE);*/ } #ifdef CONFIG_REISERFS_CHECK { int prev, next; /* check record locations */ deh = B_I_DEH(bh, ih); - for (i = 0; i < I_ENTRY_COUNT(ih); i++) { + for (i = 0; i < ih_entry_count(ih); i++) { next = (i < - I_ENTRY_COUNT(ih) - - 1) ? deh_location(&(deh[i + 1])) : 0; - prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0; + ih_entry_count(ih) - + 1) ? deh_location(&deh[i + 1]) : 0; + prev = (i != 0) ? deh_location(&deh[i - 1]) : 0; - if (prev && prev <= deh_location(&(deh[i]))) + if (prev && prev <= deh_location(&deh[i])) reiserfs_error(sb_from_bi(bi), "vs-10240", "directory item (%h) " "corrupted (prev %a, " "cur(%d) %a)", ih, deh + i - 1, i, deh + i); - if (next && next >= deh_location(&(deh[i]))) + if (next && next >= deh_location(&deh[i])) reiserfs_error(sb_from_bi(bi), "vs-10250", "directory item (%h) " "corrupted (cur(%d) %a, " diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index dc5236f6de1..cd11358b10c 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -22,8 +22,10 @@ #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); -// directory item contains array of entry headers. This performs -// binary search through that array +/* + * directory item contains array of entry headers. This performs + * binary search through that array + */ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) { struct item_head *ih = de->de_ih; @@ -31,7 +33,7 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) int rbound, lbound, j; lbound = 0; - rbound = I_ENTRY_COUNT(ih) - 1; + rbound = ih_entry_count(ih) - 1; for (j = (rbound + lbound) / 2; lbound <= rbound; j = (rbound + lbound) / 2) { @@ -43,7 +45,7 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) lbound = j + 1; continue; } - // this is not name found, but matched third key component + /* this is not name found, but matched third key component */ de->de_entry_num = j; return NAME_FOUND; } @@ -52,17 +54,21 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) return NAME_NOT_FOUND; } -// comment? maybe something like set de to point to what the path points to? +/* + * comment? maybe something like set de to point to what the path points to? + */ static inline void set_de_item_location(struct reiserfs_dir_entry *de, struct treepath *path) { de->de_bh = get_last_bh(path); - de->de_ih = get_ih(path); + de->de_ih = tp_item_head(path); de->de_deh = B_I_DEH(de->de_bh, de->de_ih); de->de_item_num = PATH_LAST_POSITION(path); } -// de_bh, de_ih, de_deh (points to first element of array), de_item_num is set +/* + * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set + */ inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) { struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; @@ -71,17 +77,17 @@ inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num); de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0); - de->de_name = B_I_PITEM(de->de_bh, de->de_ih) + deh_location(deh); + de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh); if (de->de_name[de->de_namelen - 1] == 0) de->de_namelen = strlen(de->de_name); } -// what entry points to +/* what entry points to */ static inline void set_de_object_key(struct reiserfs_dir_entry *de) { BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); - de->de_dir_id = deh_dir_id(&(de->de_deh[de->de_entry_num])); - de->de_objectid = deh_objectid(&(de->de_deh[de->de_entry_num])); + de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]); + de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]); } static inline void store_de_entry_key(struct reiserfs_dir_entry *de) @@ -96,21 +102,20 @@ static inline void store_de_entry_key(struct reiserfs_dir_entry *de) le32_to_cpu(de->de_ih->ih_key.k_dir_id); de->de_entry_key.on_disk_key.k_objectid = le32_to_cpu(de->de_ih->ih_key.k_objectid); - set_cpu_key_k_offset(&(de->de_entry_key), deh_offset(deh)); - set_cpu_key_k_type(&(de->de_entry_key), TYPE_DIRENTRY); + set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh)); + set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY); } -/* We assign a key to each directory item, and place multiple entries -in a single directory item. A directory item has a key equal to the -key of the first directory entry in it. - -This function first calls search_by_key, then, if item whose first -entry matches is not found it looks for the entry inside directory -item found by search_by_key. Fills the path to the entry, and to the -entry position in the item - -*/ - +/* + * We assign a key to each directory item, and place multiple entries in a + * single directory item. A directory item has a key equal to the key of + * the first directory entry in it. + + * This function first calls search_by_key, then, if item whose first entry + * matches is not found it looks for the entry inside directory item found + * by search_by_key. Fills the path to the entry, and to the entry position + * in the item + */ /* The function is NOT SCHEDULE-SAFE! */ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, struct treepath *path, struct reiserfs_dir_entry *de) @@ -144,7 +149,7 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, #ifdef CONFIG_REISERFS_CHECK if (!is_direntry_le_ih(de->de_ih) || - COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) { + COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) { print_block(de->de_bh, 0, -1, -1); reiserfs_panic(sb, "vs-7005", "found item %h is not directory " "item or does not belong to the same directory " @@ -152,12 +157,17 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, } #endif /* CONFIG_REISERFS_CHECK */ - /* binary search in directory item by third componen t of the - key. sets de->de_entry_num of de */ + /* + * binary search in directory item by third component of the + * key. sets de->de_entry_num of de + */ retval = bin_search_in_dir_item(de, cpu_key_k_offset(key)); path->pos_in_item = de->de_entry_num; if (retval != NAME_NOT_FOUND) { - // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set + /* + * ugly, but rename needs de_bh, de_deh, de_name, + * de_namelen, de_objectid set + */ set_de_name_and_namelen(de); set_de_object_key(de); } @@ -166,11 +176,12 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, /* Keyed 32-bit hash function using TEA in a Davis-Meyer function */ -/* The third component is hashed, and you can choose from more than - one hash function. Per directory hashes are not yet implemented - but are thought about. This function should be moved to hashes.c - Jedi, please do so. -Hans */ - +/* + * The third component is hashed, and you can choose from more than + * one hash function. Per directory hashes are not yet implemented + * but are thought about. This function should be moved to hashes.c + * Jedi, please do so. -Hans + */ static __u32 get_third_component(struct super_block *s, const char *name, int len) { @@ -183,11 +194,13 @@ static __u32 get_third_component(struct super_block *s, res = REISERFS_SB(s)->s_hash_function(name, len); - // take bits from 7-th to 30-th including both bounds + /* take bits from 7-th to 30-th including both bounds */ res = GET_HASH_VALUE(res); if (res == 0) - // needed to have no names before "." and ".." those have hash - // value == 0 and generation conters 1 and 2 accordingly + /* + * needed to have no names before "." and ".." those have hash + * value == 0 and generation conters 1 and 2 accordingly + */ res = 128; return res + MAX_GENERATION_NUMBER; } @@ -208,7 +221,7 @@ static int reiserfs_match(struct reiserfs_dir_entry *de, /* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */ - /* used when hash collisions exist */ +/* used when hash collisions exist */ static int linear_search_in_dir_item(struct cpu_key *key, struct reiserfs_dir_entry *de, @@ -220,7 +233,7 @@ static int linear_search_in_dir_item(struct cpu_key *key, i = de->de_entry_num; - if (i == I_ENTRY_COUNT(de->de_ih) || + if (i == ih_entry_count(de->de_ih) || GET_HASH_VALUE(deh_offset(deh + i)) != GET_HASH_VALUE(cpu_key_k_offset(key))) { i--; @@ -232,43 +245,50 @@ static int linear_search_in_dir_item(struct cpu_key *key, deh += i; for (; i >= 0; i--, deh--) { + /* hash value does not match, no need to check whole name */ if (GET_HASH_VALUE(deh_offset(deh)) != GET_HASH_VALUE(cpu_key_k_offset(key))) { - // hash value does not match, no need to check whole name return NAME_NOT_FOUND; } - /* mark, that this generation number is used */ + /* mark that this generation number is used */ if (de->de_gen_number_bit_string) set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), de->de_gen_number_bit_string); - // calculate pointer to name and namelen + /* calculate pointer to name and namelen */ de->de_entry_num = i; set_de_name_and_namelen(de); + /* + * de's de_name, de_namelen, de_recordlen are set. + * Fill the rest. + */ if ((retval = reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) { - // de's de_name, de_namelen, de_recordlen are set. Fill the rest: - // key of pointed object + /* key of pointed object */ set_de_object_key(de); store_de_entry_key(de); - // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE + /* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */ return retval; } } if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0) - /* we have reached left most entry in the node. In common we - have to go to the left neighbor, but if generation counter - is 0 already, we know for sure, that there is no name with - the same hash value */ - // FIXME: this work correctly only because hash value can not - // be 0. Btw, in case of Yura's hash it is probably possible, - // so, this is a bug + /* + * we have reached left most entry in the node. In common we + * have to go to the left neighbor, but if generation counter + * is 0 already, we know for sure, that there is no name with + * the same hash value + */ + /* + * FIXME: this work correctly only because hash value can not + * be 0. Btw, in case of Yura's hash it is probably possible, + * so, this is a bug + */ return NAME_NOT_FOUND; RFALSE(de->de_item_num, @@ -277,8 +297,10 @@ static int linear_search_in_dir_item(struct cpu_key *key, return GOTO_PREVIOUS_ITEM; } -// may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND -// FIXME: should add something like IOERROR +/* + * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND + * FIXME: should add something like IOERROR + */ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, struct treepath *path_to_entry, struct reiserfs_dir_entry *de) @@ -307,13 +329,19 @@ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, retval = linear_search_in_dir_item(&key_to_search, de, name, namelen); + /* + * there is no need to scan directory anymore. + * Given entry found or does not exist + */ if (retval != GOTO_PREVIOUS_ITEM) { - /* there is no need to scan directory anymore. Given entry found or does not exist */ path_to_entry->pos_in_item = de->de_entry_num; return retval; } - /* there is left neighboring item of this directory and given entry can be there */ + /* + * there is left neighboring item of this directory + * and given entry can be there + */ set_cpu_key_k_offset(&key_to_search, le_ih_k_offset(de->de_ih) - 1); pathrelse(path_to_entry); @@ -341,14 +369,16 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, pathrelse(&path_to_entry); if (retval == NAME_FOUND) { inode = reiserfs_iget(dir->i_sb, - (struct cpu_key *)&(de.de_dir_id)); + (struct cpu_key *)&de.de_dir_id); if (!inode || IS_ERR(inode)) { reiserfs_write_unlock(dir->i_sb); return ERR_PTR(-EACCES); } - /* Propagate the private flag so we know we're - * in the priv tree */ + /* + * Propagate the private flag so we know we're + * in the priv tree + */ if (IS_PRIVATE(dir)) inode->i_flags |= S_PRIVATE; } @@ -361,9 +391,9 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, } /* -** looks up the dentry of the parent directory for child. -** taken from ext2_get_parent -*/ + * looks up the dentry of the parent directory for child. + * taken from ext2_get_parent + */ struct dentry *reiserfs_get_parent(struct dentry *child) { int retval; @@ -384,7 +414,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child) reiserfs_write_unlock(dir->i_sb); return ERR_PTR(-ENOENT); } - inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); + inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id); reiserfs_write_unlock(dir->i_sb); return d_obtain_alias(inode); @@ -406,8 +436,13 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, struct reiserfs_dir_entry de; DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1); int gen_number; - char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc - if we create file with short name */ + + /* + * 48 bytes now and we avoid kmalloc if we + * create file with short name + */ + char small_buf[32 + DEH_SIZE]; + char *buffer; int buflen, paste_size; int retval; @@ -439,21 +474,30 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, (get_inode_sd_version(dir) == STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen; - /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */ + /* + * fill buffer : directory entry head, name[, dir objectid | , + * stat data | ,stat data, dir objectid ] + */ deh = (struct reiserfs_de_head *)buffer; deh->deh_location = 0; /* JDM Endian safe if 0 */ put_deh_offset(deh, cpu_key_k_offset(&entry_key)); deh->deh_state = 0; /* JDM Endian safe if 0 */ /* put key (ino analog) to de */ - deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; /* safe: k_dir_id is le */ - deh->deh_objectid = INODE_PKEY(inode)->k_objectid; /* safe: k_objectid is le */ + + /* safe: k_dir_id is le */ + deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; + /* safe: k_objectid is le */ + deh->deh_objectid = INODE_PKEY(inode)->k_objectid; /* copy name */ memcpy((char *)(deh + 1), name, namelen); /* padd by 0s to the 4 byte boundary */ padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen); - /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */ + /* + * entry is ready to be pasted into tree, set 'visibility' + * and 'stat data in entry' attributes + */ mark_de_without_sd(deh); visible ? mark_de_visible(deh) : mark_de_hidden(deh); @@ -499,7 +543,8 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, /* update max-hash-collisions counter in reiserfs_sb_info */ PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number); - if (gen_number != 0) { /* we need to re-search for the insertion point */ + /* we need to re-search for the insertion point */ + if (gen_number != 0) { if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) != NAME_NOT_FOUND) { reiserfs_warning(dir->i_sb, "vs-7032", @@ -527,18 +572,19 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, dir->i_size += paste_size; dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; if (!S_ISDIR(inode->i_mode) && visible) - // reiserfs_mkdir or reiserfs_rename will do that by itself + /* reiserfs_mkdir or reiserfs_rename will do that by itself */ reiserfs_update_sd(th, dir); reiserfs_check_path(&path); return 0; } -/* quota utility function, call if you've had to abort after calling -** new_inode_init, and have not called reiserfs_new_inode yet. -** This should only be called on inodes that do not have stat data -** inserted into the tree yet. -*/ +/* + * quota utility function, call if you've had to abort after calling + * new_inode_init, and have not called reiserfs_new_inode yet. + * This should only be called on inodes that do not have stat data + * inserted into the tree yet. + */ static int drop_new_inode(struct inode *inode) { dquot_drop(inode); @@ -548,18 +594,23 @@ static int drop_new_inode(struct inode *inode) return 0; } -/* utility function that does setup for reiserfs_new_inode. -** dquot_initialize needs lots of credits so it's better to have it -** outside of a transaction, so we had to pull some bits of -** reiserfs_new_inode out into this func. -*/ +/* + * utility function that does setup for reiserfs_new_inode. + * dquot_initialize needs lots of credits so it's better to have it + * outside of a transaction, so we had to pull some bits of + * reiserfs_new_inode out into this func. + */ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) { - /* Make inode invalid - just in case we are going to drop it before - * the initialization happens */ + /* + * Make inode invalid - just in case we are going to drop it before + * the initialization happens + */ INODE_PKEY(inode)->k_objectid = 0; - /* the quota init calls have to know who to charge the quota to, so - ** we have to set uid and gid here + + /* + * the quota init calls have to know who to charge the quota to, so + * we have to set uid and gid here */ inode_init_owner(inode, dir, mode); dquot_initialize(inode); @@ -571,7 +622,10 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod { int retval; struct inode *inode; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + /* + * We need blocks for transaction + (user+group)*(quotas + * for new inode + update of quota for directory owner) + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + @@ -618,7 +672,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod int err; drop_nlink(inode); reiserfs_update_sd(&th, inode); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); if (err) retval = err; unlock_new_inode(inode); @@ -630,9 +684,9 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod unlock_new_inode(inode); d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); - out_failed: +out_failed: reiserfs_write_unlock(dir->i_sb); return retval; } @@ -644,7 +698,10 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode struct inode *inode; struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + /* + * We need blocks for transaction + (user+group)*(quotas + * for new inode + update of quota for directory owner) + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + @@ -685,7 +742,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode inode->i_op = &reiserfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); - //FIXME: needed for block and char devices only + /* FIXME: needed for block and char devices only */ reiserfs_update_sd(&th, inode); reiserfs_update_inode_transaction(inode); @@ -698,7 +755,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode int err; drop_nlink(inode); reiserfs_update_sd(&th, inode); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); if (err) retval = err; unlock_new_inode(inode); @@ -708,9 +765,9 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode unlock_new_inode(inode); d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); - out_failed: +out_failed: reiserfs_write_unlock(dir->i_sb); return retval; } @@ -721,7 +778,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct inode *inode; struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + /* + * We need blocks for transaction + (user+group)*(quotas + * for new inode + update of quota for directory owner) + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + @@ -730,7 +790,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode dquot_initialize(dir); #ifdef DISPLACE_NEW_PACKING_LOCALITIES - /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ + /* + * set flag that new packing locality created and new blocks + * for the content of that directory are not displaced yet + */ REISERFS_I(dir)->new_packing_locality = 1; #endif mode = S_IFDIR | mode; @@ -754,8 +817,9 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode goto out_failed; } - /* inc the link count now, so another writer doesn't overflow it while - ** we sleep later on. + /* + * inc the link count now, so another writer doesn't overflow + * it while we sleep later on. */ INC_DIR_INODE_NLINK(dir) @@ -774,7 +838,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode inode->i_op = &reiserfs_dir_inode_operations; inode->i_fop = &reiserfs_dir_operations; - // note, _this_ add_entry will not update dir's stat data + /* note, _this_ add_entry will not update dir's stat data */ retval = reiserfs_add_entry(&th, dir, dentry->d_name.name, dentry->d_name.len, inode, 1 /*visible */ ); @@ -783,19 +847,19 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode clear_nlink(inode); DEC_DIR_INODE_NLINK(dir); reiserfs_update_sd(&th, inode); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); if (err) retval = err; unlock_new_inode(inode); iput(inode); goto out_failed; } - // the above add_entry did not update dir's stat data + /* the above add_entry did not update dir's stat data */ reiserfs_update_sd(&th, dir); unlock_new_inode(inode); d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); out_failed: reiserfs_write_unlock(dir->i_sb); return retval; @@ -803,10 +867,11 @@ out_failed: static inline int reiserfs_empty_dir(struct inode *inode) { - /* we can cheat because an old format dir cannot have - ** EMPTY_DIR_SIZE, and a new format dir cannot have - ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, - ** regardless of disk format version, the directory is empty. + /* + * we can cheat because an old format dir cannot have + * EMPTY_DIR_SIZE, and a new format dir cannot have + * EMPTY_DIR_SIZE_V1. So, if the inode is either size, + * regardless of disk format version, the directory is empty. */ if (inode->i_size != EMPTY_DIR_SIZE && inode->i_size != EMPTY_DIR_SIZE_V1) { @@ -824,10 +889,12 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) INITIALIZE_PATH(path); struct reiserfs_dir_entry de; - /* we will be doing 2 balancings and update 2 stat data, we change quotas - * of the owner of the directory and of the owner of the parent directory. - * The quota structure is possibly deleted only on last iput => outside - * of this transaction */ + /* + * we will be doing 2 balancings and update 2 stat data, we + * change quotas of the owner of the directory and of the owner + * of the parent directory. The quota structure is possibly + * deleted only on last iput => outside of this transaction + */ jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); @@ -856,8 +923,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) reiserfs_update_inode_transaction(dir); if (de.de_objectid != inode->i_ino) { - // FIXME: compare key of an object and a key found in the - // entry + /* + * FIXME: compare key of an object and a key found in the entry + */ retval = -EIO; goto end_rmdir; } @@ -867,7 +935,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) } /* cut entry from dir directory */ - retval = reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL, /* page */ + retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key, + dir, NULL, /* page */ 0 /*new file size - not used here */ ); if (retval < 0) goto end_rmdir; @@ -888,18 +957,20 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) /* prevent empty directory from getting lost */ add_save_link(&th, inode, 0 /* not truncate */ ); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); reiserfs_check_path(&path); - out_rmdir: +out_rmdir: reiserfs_write_unlock(dir->i_sb); return retval; - end_rmdir: - /* we must release path, because we did not call - reiserfs_cut_from_item, or reiserfs_cut_from_item does not - release path if operation was not complete */ +end_rmdir: + /* + * we must release path, because we did not call + * reiserfs_cut_from_item, or reiserfs_cut_from_item does not + * release path if operation was not complete + */ pathrelse(&path); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); reiserfs_write_unlock(dir->i_sb); return err ? err : retval; } @@ -918,10 +989,13 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) inode = dentry->d_inode; - /* in this transaction we can be doing at max two balancings and update - * two stat datas, we change quotas of the owner of the directory and of - * the owner of the parent directory. The quota structure is possibly - * deleted only on iput => outside of this transaction */ + /* + * in this transaction we can be doing at max two balancings and + * update two stat datas, we change quotas of the owner of the + * directory and of the owner of the parent directory. The quota + * structure is possibly deleted only on iput => outside of + * this transaction + */ jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); @@ -946,8 +1020,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) reiserfs_update_inode_transaction(dir); if (de.de_objectid != inode->i_ino) { - // FIXME: compare key of an object and a key found in the - // entry + /* + * FIXME: compare key of an object and a key found in the entry + */ retval = -EIO; goto end_unlink; } @@ -968,7 +1043,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) savelink = inode->i_nlink; retval = - reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL, + reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL, 0); if (retval < 0) { inc_nlink(inode); @@ -985,18 +1060,18 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) /* prevent file from getting lost */ add_save_link(&th, inode, 0 /* not truncate */ ); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); reiserfs_check_path(&path); reiserfs_write_unlock(dir->i_sb); return retval; - end_unlink: +end_unlink: pathrelse(&path); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); reiserfs_check_path(&path); if (err) retval = err; - out_unlink: +out_unlink: reiserfs_write_unlock(dir->i_sb); return retval; } @@ -1011,7 +1086,10 @@ static int reiserfs_symlink(struct inode *parent_dir, struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; int mode = S_IFLNK | S_IRWXUGO; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + /* + * We need blocks for transaction + (user+group)*(quotas for + * new inode + update of quota for directory owner) + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + @@ -1070,17 +1148,13 @@ static int reiserfs_symlink(struct inode *parent_dir, inode->i_op = &reiserfs_symlink_inode_operations; inode->i_mapping->a_ops = &reiserfs_address_space_operations; - // must be sure this inode is written with this transaction - // - //reiserfs_update_sd (&th, inode, READ_BLOCKS); - retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; drop_nlink(inode); reiserfs_update_sd(&th, inode); - err = journal_end(&th, parent_dir->i_sb, jbegin_count); + err = journal_end(&th); if (err) retval = err; unlock_new_inode(inode); @@ -1090,8 +1164,8 @@ static int reiserfs_symlink(struct inode *parent_dir, unlock_new_inode(inode); d_instantiate(dentry, inode); - retval = journal_end(&th, parent_dir->i_sb, jbegin_count); - out_failed: + retval = journal_end(&th); +out_failed: reiserfs_write_unlock(parent_dir->i_sb); return retval; } @@ -1102,7 +1176,10 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, int retval; struct inode *inode = old_dentry->d_inode; struct reiserfs_transaction_handle th; - /* We need blocks for transaction + update of quotas for the owners of the directory */ + /* + * We need blocks for transaction + update of quotas for + * the owners of the directory + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); @@ -1111,7 +1188,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, reiserfs_write_lock(dir->i_sb); if (inode->i_nlink >= REISERFS_LINK_MAX) { - //FIXME: sd_nlink is 32 bit for new files + /* FIXME: sd_nlink is 32 bit for new files */ reiserfs_write_unlock(dir->i_sb); return -EMLINK; } @@ -1137,7 +1214,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, if (retval) { int err; drop_nlink(inode); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); reiserfs_write_unlock(dir->i_sb); return err ? err : retval; } @@ -1147,7 +1224,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); reiserfs_write_unlock(dir->i_sb); return retval; } @@ -1158,9 +1235,9 @@ static int de_still_valid(const char *name, int len, { struct reiserfs_dir_entry tmp = *de; - // recalculate pointer to name and name length + /* recalculate pointer to name and name length */ set_de_name_and_namelen(&tmp); - // FIXME: could check more + /* FIXME: could check more */ if (tmp.de_namelen != len || memcmp(name, de->de_name, len)) return 0; return 1; @@ -1217,14 +1294,16 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned long savelink = 1; struct timespec ctime; - /* three balancings: (1) old name removal, (2) new name insertion - and (3) maybe "save" link insertion - stat data updates: (1) old directory, - (2) new directory and (3) maybe old object stat data (when it is - directory) and (4) maybe stat data of object to which new entry - pointed initially and (5) maybe block containing ".." of - renamed directory - quota updates: two parent directories */ + /* + * three balancings: (1) old name removal, (2) new name insertion + * and (3) maybe "save" link insertion + * stat data updates: (1) old directory, + * (2) new directory and (3) maybe old object stat data (when it is + * directory) and (4) maybe stat data of object to which new entry + * pointed initially and (5) maybe block containing ".." of + * renamed directory + * quota updates: two parent directories + */ jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); @@ -1235,8 +1314,10 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; - // make sure, that oldname still exists and points to an object we - // are going to rename + /* + * make sure that oldname still exists and points to an object we + * are going to rename + */ old_de.de_gen_number_bit_string = NULL; reiserfs_write_lock(old_dir->i_sb); retval = @@ -1256,10 +1337,11 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode_mode = old_inode->i_mode; if (S_ISDIR(old_inode_mode)) { - // make sure, that directory being renamed has correct ".." - // and that its new parent directory has not too many links - // already - + /* + * make sure that directory being renamed has correct ".." + * and that its new parent directory has not too many links + * already + */ if (new_dentry_inode) { if (!reiserfs_empty_dir(new_dentry_inode)) { reiserfs_write_unlock(old_dir->i_sb); @@ -1267,8 +1349,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - /* directory is renamed, its parent directory will be changed, - ** so find ".." entry + /* + * directory is renamed, its parent directory will be changed, + * so find ".." entry */ dot_dot_de.de_gen_number_bit_string = NULL; retval = @@ -1303,7 +1386,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, "new entry is found, new inode == 0"); } } else if (retval) { - int err = journal_end(&th, old_dir->i_sb, jbegin_count); + int err = journal_end(&th); reiserfs_write_unlock(old_dir->i_sb); return err ? err : retval; } @@ -1311,8 +1394,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, reiserfs_update_inode_transaction(old_dir); reiserfs_update_inode_transaction(new_dir); - /* this makes it so an fsync on an open fd for the old name will - ** commit the rename operation + /* + * this makes it so an fsync on an open fd for the old name will + * commit the rename operation */ reiserfs_update_inode_transaction(old_inode); @@ -1320,38 +1404,45 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, reiserfs_update_inode_transaction(new_dentry_inode); while (1) { - // look for old name using corresponding entry key (found by reiserfs_find_entry) + /* + * look for old name using corresponding entry key + * (found by reiserfs_find_entry) + */ if ((retval = search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key, &old_entry_path, &old_de)) != NAME_FOUND) { pathrelse(&old_entry_path); - journal_end(&th, old_dir->i_sb, jbegin_count); + journal_end(&th); reiserfs_write_unlock(old_dir->i_sb); return -EIO; } - copy_item_head(&old_entry_ih, get_ih(&old_entry_path)); + copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path)); reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1); - // look for new name by reiserfs_find_entry + /* look for new name by reiserfs_find_entry */ new_de.de_gen_number_bit_string = NULL; retval = reiserfs_find_entry(new_dir, new_dentry->d_name.name, new_dentry->d_name.len, &new_entry_path, &new_de); - // reiserfs_add_entry should not return IO_ERROR, because it is called with essentially same parameters from - // reiserfs_add_entry above, and we'll catch any i/o errors before we get here. + /* + * reiserfs_add_entry should not return IO_ERROR, + * because it is called with essentially same parameters from + * reiserfs_add_entry above, and we'll catch any i/o errors + * before we get here. + */ if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) { pathrelse(&new_entry_path); pathrelse(&old_entry_path); - journal_end(&th, old_dir->i_sb, jbegin_count); + journal_end(&th); reiserfs_write_unlock(old_dir->i_sb); return -EIO; } - copy_item_head(&new_entry_ih, get_ih(&new_entry_path)); + copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path)); reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1); @@ -1364,28 +1455,32 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, pathrelse(&dot_dot_entry_path); pathrelse(&new_entry_path); pathrelse(&old_entry_path); - journal_end(&th, old_dir->i_sb, jbegin_count); + journal_end(&th); reiserfs_write_unlock(old_dir->i_sb); return -EIO; } copy_item_head(&dot_dot_ih, - get_ih(&dot_dot_entry_path)); - // node containing ".." gets into transaction + tp_item_head(&dot_dot_entry_path)); + /* node containing ".." gets into transaction */ reiserfs_prepare_for_journal(old_inode->i_sb, dot_dot_de.de_bh, 1); } - /* we should check seals here, not do - this stuff, yes? Then, having - gathered everything into RAM we - should lock the buffers, yes? -Hans */ - /* probably. our rename needs to hold more - ** than one path at once. The seals would - ** have to be written to deal with multi-path - ** issues -chris + /* + * we should check seals here, not do + * this stuff, yes? Then, having + * gathered everything into RAM we + * should lock the buffers, yes? -Hans */ - /* sanity checking before doing the rename - avoid races many - ** of the above checks could have scheduled. We have to be - ** sure our items haven't been shifted by another process. + /* + * probably. our rename needs to hold more + * than one path at once. The seals would + * have to be written to deal with multi-path + * issues -chris + */ + /* + * sanity checking before doing the rename - avoid races many + * of the above checks could have scheduled. We have to be + * sure our items haven't been shifted by another process. */ if (item_moved(&new_entry_ih, &new_entry_path) || !entry_points_to_object(new_dentry->d_name.name, @@ -1430,24 +1525,28 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, break; } - /* ok, all the changes can be done in one fell swoop when we - have claimed all the buffers needed. */ + /* + * ok, all the changes can be done in one fell swoop when we + * have claimed all the buffers needed. + */ mark_de_visible(new_de.de_deh + new_de.de_entry_num); set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode)); - journal_mark_dirty(&th, old_dir->i_sb, new_de.de_bh); + journal_mark_dirty(&th, new_de.de_bh); mark_de_hidden(old_de.de_deh + old_de.de_entry_num); - journal_mark_dirty(&th, old_dir->i_sb, old_de.de_bh); + journal_mark_dirty(&th, old_de.de_bh); ctime = CURRENT_TIME_SEC; old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; - /* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch which adds ctime update of - renamed object */ + /* + * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch + * which adds ctime update of renamed object + */ old_inode->i_ctime = ctime; if (new_dentry_inode) { - // adjust link number of the victim + /* adjust link number of the victim */ if (S_ISDIR(new_dentry_inode->i_mode)) { clear_nlink(new_dentry_inode); } else { @@ -1460,25 +1559,32 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old_inode_mode)) { /* adjust ".." of renamed directory */ set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); - journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh); + journal_mark_dirty(&th, dot_dot_de.de_bh); + /* + * there (in new_dir) was no directory, so it got new link + * (".." of renamed directory) + */ if (!new_dentry_inode) - /* there (in new_dir) was no directory, so it got new link - (".." of renamed directory) */ INC_DIR_INODE_NLINK(new_dir); /* old directory lost one link - ".. " of renamed directory */ DEC_DIR_INODE_NLINK(old_dir); } - // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse + /* + * looks like in 2.3.99pre3 brelse is atomic. + * so we can use pathrelse + */ pathrelse(&new_entry_path); pathrelse(&dot_dot_entry_path); - // FIXME: this reiserfs_cut_from_item's return value may screw up - // anybody, but it will panic if will not be able to find the - // entry. This needs one more clean up + /* + * FIXME: this reiserfs_cut_from_item's return value may screw up + * anybody, but it will panic if will not be able to find the + * entry. This needs one more clean up + */ if (reiserfs_cut_from_item - (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, + (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL, 0) < 0) reiserfs_error(old_dir->i_sb, "vs-7060", "couldn't not cut old name. Fsck later?"); @@ -1496,16 +1602,13 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, reiserfs_update_sd(&th, new_dentry_inode); } - retval = journal_end(&th, old_dir->i_sb, jbegin_count); + retval = journal_end(&th); reiserfs_write_unlock(old_dir->i_sb); return retval; } -/* - * directories can handle most operations... - */ +/* directories can handle most operations... */ const struct inode_operations reiserfs_dir_inode_operations = { - //&reiserfs_dir_operations, /* default_file_ops */ .create = reiserfs_create, .lookup = reiserfs_lookup, .link = reiserfs_link, @@ -1522,6 +1625,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; /* @@ -1538,8 +1642,6 @@ const struct inode_operations reiserfs_symlink_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, - }; /* @@ -1553,4 +1655,5 @@ const struct inode_operations reiserfs_special_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index f732d6a5251..99a5d5dae46 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -7,7 +7,7 @@ #include <linux/time.h> #include "reiserfs.h" -// find where objectid map starts +/* find where objectid map starts */ #define objectid_map(s,rs) (old_format_only (s) ? \ (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\ (__le32 *)((rs) + 1)) @@ -20,7 +20,7 @@ static void check_objectid_map(struct super_block *s, __le32 * map) reiserfs_panic(s, "vs-15010", "map corrupted: %lx", (long unsigned int)le32_to_cpu(map[0])); - // FIXME: add something else here + /* FIXME: add something else here */ } #else @@ -29,19 +29,21 @@ static void check_objectid_map(struct super_block *s, __le32 * map) } #endif -/* When we allocate objectids we allocate the first unused objectid. - Each sequence of objectids in use (the odd sequences) is followed - by a sequence of objectids not in use (the even sequences). We - only need to record the last objectid in each of these sequences - (both the odd and even sequences) in order to fully define the - boundaries of the sequences. A consequence of allocating the first - objectid not in use is that under most conditions this scheme is - extremely compact. The exception is immediately after a sequence - of operations which deletes a large number of objects of - non-sequential objectids, and even then it will become compact - again as soon as more objects are created. Note that many - interesting optimizations of layout could result from complicating - objectid assignment, but we have deferred making them for now. */ +/* + * When we allocate objectids we allocate the first unused objectid. + * Each sequence of objectids in use (the odd sequences) is followed + * by a sequence of objectids not in use (the even sequences). We + * only need to record the last objectid in each of these sequences + * (both the odd and even sequences) in order to fully define the + * boundaries of the sequences. A consequence of allocating the first + * objectid not in use is that under most conditions this scheme is + * extremely compact. The exception is immediately after a sequence + * of operations which deletes a large number of objects of + * non-sequential objectids, and even then it will become compact + * again as soon as more objects are created. Note that many + * interesting optimizations of layout could result from complicating + * objectid assignment, but we have deferred making them for now. + */ /* get unique object identifier */ __u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) @@ -64,26 +66,30 @@ __u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) return 0; } - /* This incrementation allocates the first unused objectid. That - is to say, the first entry on the objectid map is the first - unused objectid, and by incrementing it we use it. See below - where we check to see if we eliminated a sequence of unused - objectids.... */ + /* + * This incrementation allocates the first unused objectid. That + * is to say, the first entry on the objectid map is the first + * unused objectid, and by incrementing it we use it. See below + * where we check to see if we eliminated a sequence of unused + * objectids.... + */ map[1] = cpu_to_le32(unused_objectid + 1); - /* Now we check to see if we eliminated the last remaining member of - the first even sequence (and can eliminate the sequence by - eliminating its last objectid from oids), and can collapse the - first two odd sequences into one sequence. If so, then the net - result is to eliminate a pair of objectids from oids. We do this - by shifting the entire map to the left. */ + /* + * Now we check to see if we eliminated the last remaining member of + * the first even sequence (and can eliminate the sequence by + * eliminating its last objectid from oids), and can collapse the + * first two odd sequences into one sequence. If so, then the net + * result is to eliminate a pair of objectids from oids. We do this + * by shifting the entire map to the left. + */ if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) { memmove(map + 1, map + 3, (sb_oid_cursize(rs) - 3) * sizeof(__u32)); set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); } - journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); return unused_objectid; } @@ -97,30 +103,33 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, int i = 0; BUG_ON(!th->t_trans_id); - //return; + /*return; */ check_objectid_map(s, map); reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); - journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); - - /* start at the beginning of the objectid map (i = 0) and go to - the end of it (i = disk_sb->s_oid_cursize). Linear search is - what we use, though it is possible that binary search would be - more efficient after performing lots of deletions (which is - when oids is large.) We only check even i's. */ + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); + + /* + * start at the beginning of the objectid map (i = 0) and go to + * the end of it (i = disk_sb->s_oid_cursize). Linear search is + * what we use, though it is possible that binary search would be + * more efficient after performing lots of deletions (which is + * when oids is large.) We only check even i's. + */ while (i < sb_oid_cursize(rs)) { if (objectid_to_release == le32_to_cpu(map[i])) { /* This incrementation unallocates the objectid. */ - //map[i]++; le32_add_cpu(&map[i], 1); - /* Did we unallocate the last member of an odd sequence, and can shrink oids? */ + /* + * Did we unallocate the last member of an + * odd sequence, and can shrink oids? + */ if (map[i] == map[i + 1]) { /* shrink objectid map */ memmove(map + i, map + i + 2, (sb_oid_cursize(rs) - i - 2) * sizeof(__u32)); - //disk_sb->s_oid_cursize -= 2; set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); RFALSE(sb_oid_cursize(rs) < 2 || @@ -135,14 +144,19 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, objectid_to_release < le32_to_cpu(map[i + 1])) { /* size of objectid map is not changed */ if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) { - //objectid_map[i+1]--; le32_add_cpu(&map[i + 1], -1); return; } - /* JDM comparing two little-endian values for equality -- safe */ + /* + * JDM comparing two little-endian values for + * equality -- safe + */ + /* + * objectid map must be expanded, but + * there is no space + */ if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) { - /* objectid map must be expanded, but there is no space */ PROC_INFO_INC(s, leaked_oid); return; } @@ -178,8 +192,9 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s) new_objectid_map = (__le32 *) (disk_sb + 1); if (cur_size > new_size) { - /* mark everyone used that was listed as free at the end of the objectid - ** map + /* + * mark everyone used that was listed as free at + * the end of the objectid map */ objectid_map[new_size - 1] = objectid_map[cur_size - 1]; set_sb_oid_cursize(disk_sb, new_size); diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 54944d5a4a6..c9b47e91baf 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -172,18 +172,19 @@ static char *is_there_reiserfs_struct(char *fmt, int *what) return k; } -/* debugging reiserfs we used to print out a lot of different - variables, like keys, item headers, buffer heads etc. Values of - most fields matter. So it took a long time just to write - appropriative printk. With this reiserfs_warning you can use format - specification for complex structures like you used to do with - printfs for integers, doubles and pointers. For instance, to print - out key structure you have to write just: - reiserfs_warning ("bad key %k", key); - instead of - printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, - key->k_offset, key->k_uniqueness); -*/ +/* + * debugging reiserfs we used to print out a lot of different + * variables, like keys, item headers, buffer heads etc. Values of + * most fields matter. So it took a long time just to write + * appropriative printk. With this reiserfs_warning you can use format + * specification for complex structures like you used to do with + * printfs for integers, doubles and pointers. For instance, to print + * out key structure you have to write just: + * reiserfs_warning ("bad key %k", key); + * instead of + * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, + * key->k_offset, key->k_uniqueness); + */ static DEFINE_SPINLOCK(error_lock); static void prepare_error_buf(const char *fmt, va_list args) { @@ -243,15 +244,16 @@ static void prepare_error_buf(const char *fmt, va_list args) } -/* in addition to usual conversion specifiers this accepts reiserfs - specific conversion specifiers: - %k to print little endian key, - %K to print cpu key, - %h to print item_head, - %t to print directory entry - %z to print block head (arg must be struct buffer_head * - %b to print buffer_head -*/ +/* + * in addition to usual conversion specifiers this accepts reiserfs + * specific conversion specifiers: + * %k to print little endian key, + * %K to print cpu key, + * %h to print item_head, + * %t to print directory entry + * %z to print block head (arg must be struct buffer_head * + * %b to print buffer_head + */ #define do_reiserfs_warning(fmt)\ {\ @@ -304,50 +306,52 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) #endif } -/* The format: - - maintainer-errorid: [function-name:] message - - where errorid is unique to the maintainer and function-name is - optional, is recommended, so that anyone can easily find the bug - with a simple grep for the short to type string - maintainer-errorid. Don't bother with reusing errorids, there are - lots of numbers out there. - - Example: - - reiserfs_panic( - p_sb, "reiser-29: reiserfs_new_blocknrs: " - "one of search_start or rn(%d) is equal to MAX_B_NUM," - "which means that we are optimizing location based on the bogus location of a temp buffer (%p).", - rn, bh - ); - - Regular panic()s sometimes clear the screen before the message can - be read, thus the need for the while loop. - - Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it - pointless complexity): - - panics in reiserfs.h have numbers from 1000 to 1999 - super.c 2000 to 2999 - preserve.c (unused) 3000 to 3999 - bitmap.c 4000 to 4999 - stree.c 5000 to 5999 - prints.c 6000 to 6999 - namei.c 7000 to 7999 - fix_nodes.c 8000 to 8999 - dir.c 9000 to 9999 - lbalance.c 10000 to 10999 - ibalance.c 11000 to 11999 not ready - do_balan.c 12000 to 12999 - inode.c 13000 to 13999 - file.c 14000 to 14999 - objectid.c 15000 - 15999 - buffer.c 16000 - 16999 - symlink.c 17000 - 17999 - - . */ +/* + * The format: + * + * maintainer-errorid: [function-name:] message + * + * where errorid is unique to the maintainer and function-name is + * optional, is recommended, so that anyone can easily find the bug + * with a simple grep for the short to type string + * maintainer-errorid. Don't bother with reusing errorids, there are + * lots of numbers out there. + * + * Example: + * + * reiserfs_panic( + * p_sb, "reiser-29: reiserfs_new_blocknrs: " + * "one of search_start or rn(%d) is equal to MAX_B_NUM," + * "which means that we are optimizing location based on the " + * "bogus location of a temp buffer (%p).", + * rn, bh + * ); + * + * Regular panic()s sometimes clear the screen before the message can + * be read, thus the need for the while loop. + * + * Numbering scheme for panic used by Vladimir and Anatoly( Hans completely + * ignores this scheme, and considers it pointless complexity): + * + * panics in reiserfs_fs.h have numbers from 1000 to 1999 + * super.c 2000 to 2999 + * preserve.c (unused) 3000 to 3999 + * bitmap.c 4000 to 4999 + * stree.c 5000 to 5999 + * prints.c 6000 to 6999 + * namei.c 7000 to 7999 + * fix_nodes.c 8000 to 8999 + * dir.c 9000 to 9999 + * lbalance.c 10000 to 10999 + * ibalance.c 11000 to 11999 not ready + * do_balan.c 12000 to 12999 + * inode.c 13000 to 13999 + * file.c 14000 to 14999 + * objectid.c 15000 - 15999 + * buffer.c 16000 - 16999 + * symlink.c 17000 - 17999 + * + * . */ void __reiserfs_panic(struct super_block *sb, const char *id, const char *function, const char *fmt, ...) @@ -411,9 +415,11 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) reiserfs_abort_journal(sb, errno); } -/* this prints internal nodes (4 keys/items in line) (dc_number, - dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, - dc_size)...*/ +/* + * this prints internal nodes (4 keys/items in line) (dc_number, + * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, + * dc_size)... + */ static int print_internal(struct buffer_head *bh, int first, int last) { struct reiserfs_key *key; @@ -439,7 +445,7 @@ static int print_internal(struct buffer_head *bh, int first, int last) dc = B_N_CHILD(bh, from); reiserfs_printk("PTR %d: %y ", from, dc); - for (i = from, key = B_N_PDELIM_KEY(bh, from), dc++; i < to; + for (i = from, key = internal_key(bh, from), dc++; i < to; i++, key++, dc++) { reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); if (i && i % 4 == 0) @@ -463,7 +469,7 @@ static int print_leaf(struct buffer_head *bh, int print_mode, int first, check_leaf(bh); blkh = B_BLK_HEAD(bh); - ih = B_N_PITEM_HEAD(bh, 0); + ih = item_head(bh, 0); nr = blkh_nr_item(blkh); printk @@ -496,7 +502,7 @@ static int print_leaf(struct buffer_head *bh, int print_mode, int first, ("-------------------------------------------------------------------------------\n"); reiserfs_printk("|%2d| %h |\n", i, ih); if (print_mode & PRINT_LEAF_ITEMS) - op_print_item(ih, B_I_PITEM(bh, ih)); + op_print_item(ih, ih_item_body(bh, ih)); } printk @@ -543,9 +549,11 @@ static int print_super_block(struct buffer_head *bh) printk("Block count %u\n", sb_block_count(rs)); printk("Blocksize %d\n", sb_blocksize(rs)); printk("Free blocks %u\n", sb_free_blocks(rs)); - // FIXME: this would be confusing if - // someone stores reiserfs super block in some data block ;) + /* + * FIXME: this would be confusing if + * someone stores reiserfs super block in some data block ;) // skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs); + */ skipped = bh->b_blocknr; data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + @@ -581,8 +589,8 @@ static int print_desc_block(struct buffer_head *bh) return 0; } - -void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int last) +/* ..., int print_mode, int first, int last) */ +void print_block(struct buffer_head *bh, ...) { va_list args; int mode, first, last; @@ -644,11 +652,11 @@ void store_print_tb(struct tree_balance *tb) "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", h, (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL), - (tbSh) ? atomic_read(&(tbSh->b_count)) : -1, + (tbSh) ? atomic_read(&tbSh->b_count) : -1, (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL), - (tb->L[h]) ? atomic_read(&(tb->L[h]->b_count)) : -1, + (tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1, (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL), - (tb->R[h]) ? atomic_read(&(tb->R[h]->b_count)) : -1, + (tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1, (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL), (tb->FL[h]) ? (long long)(tb->FL[h]-> b_blocknr) : (-1LL), @@ -665,9 +673,9 @@ void store_print_tb(struct tree_balance *tb) "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0], - tb->rbytes, tb->blknum[0], tb->s0num, tb->s1num, tb->s1bytes, - tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0], - tb->rkey[0]); + tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0], + tb->sbytes[0], tb->snum[1], tb->sbytes[1], + tb->cur_blknum, tb->lkey[0], tb->rkey[0]); /* this prints balance parameters for non-leaf levels */ h = 0; @@ -690,7 +698,7 @@ void store_print_tb(struct tree_balance *tb) "%p (%llu %d)%s", tb->FEB[i], tb->FEB[i] ? (unsigned long long)tb->FEB[i]-> b_blocknr : 0ULL, - tb->FEB[i] ? atomic_read(&(tb->FEB[i]->b_count)) : 0, + tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0, (i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", "); sprintf(print_tb_buf + strlen(print_tb_buf), @@ -744,8 +752,8 @@ void check_leaf(struct buffer_head *bh) if (!bh) return; check_leaf_block_head(bh); - for (i = 0, ih = B_N_PITEM_HEAD(bh, 0); i < B_NR_ITEMS(bh); i++, ih++) - op_check_item(ih, B_I_PITEM(bh, ih)); + for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++) + op_check_item(ih, ih_item_body(bh, ih)); } void check_internal(struct buffer_head *bh) diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index a958444a75f..02b0b7d0f7d 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -419,7 +419,7 @@ int reiserfs_proc_info_init(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + strlcpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; @@ -449,7 +449,7 @@ int reiserfs_proc_info_done(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + strlcpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index f8adaee537c..bf53888c7f5 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1,5 +1,6 @@ /* - * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for licensing and copyright details + * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for + * licensing and copyright details */ #include <linux/reiserfs_fs.h> @@ -23,52 +24,73 @@ struct reiserfs_journal_list; -/** bitmasks for i_flags field in reiserfs-specific part of inode */ +/* bitmasks for i_flags field in reiserfs-specific part of inode */ typedef enum { - /** this says what format of key do all items (but stat data) of - an object have. If this is set, that format is 3.6 otherwise - - 3.5 */ + /* + * this says what format of key do all items (but stat data) of + * an object have. If this is set, that format is 3.6 otherwise - 3.5 + */ i_item_key_version_mask = 0x0001, - /** If this is unset, object has 3.5 stat data, otherwise, it has - 3.6 stat data with 64bit size, 32bit nlink etc. */ + + /* + * If this is unset, object has 3.5 stat data, otherwise, + * it has 3.6 stat data with 64bit size, 32bit nlink etc. + */ i_stat_data_version_mask = 0x0002, - /** file might need tail packing on close */ + + /* file might need tail packing on close */ i_pack_on_close_mask = 0x0004, - /** don't pack tail of file */ + + /* don't pack tail of file */ i_nopack_mask = 0x0008, - /** If those is set, "safe link" was created for this file during - truncate or unlink. Safe link is used to avoid leakage of disk - space on crash with some files open, but unlinked. */ + + /* + * If either of these are set, "safe link" was created for this + * file during truncate or unlink. Safe link is used to avoid + * leakage of disk space on crash with some files open, but unlinked. + */ i_link_saved_unlink_mask = 0x0010, i_link_saved_truncate_mask = 0x0020, + i_has_xattr_dir = 0x0040, i_data_log = 0x0080, } reiserfs_inode_flags; struct reiserfs_inode_info { __u32 i_key[4]; /* key is still 4 32 bit integers */ - /** transient inode flags that are never stored on disk. Bitmasks - for this field are defined above. */ + + /* + * transient inode flags that are never stored on disk. Bitmasks + * for this field are defined above. + */ __u32 i_flags; - __u32 i_first_direct_byte; // offset of first byte stored in direct item. + /* offset of first byte stored in direct item. */ + __u32 i_first_direct_byte; /* copy of persistent inode flags read from sd_attrs. */ __u32 i_attrs; - int i_prealloc_block; /* first unused block of a sequence of unused blocks */ + /* first unused block of a sequence of unused blocks */ + int i_prealloc_block; int i_prealloc_count; /* length of that sequence */ - struct list_head i_prealloc_list; /* per-transaction list of inodes which - * have preallocated blocks */ - unsigned new_packing_locality:1; /* new_packig_locality is created; new blocks - * for the contents of this directory should be - * displaced */ + /* per-transaction list of inodes which have preallocated blocks */ + struct list_head i_prealloc_list; - /* we use these for fsync or O_SYNC to decide which transaction - ** needs to be committed in order for this inode to be properly - ** flushed */ + /* + * new_packing_locality is created; new blocks for the contents + * of this directory should be displaced + */ + unsigned new_packing_locality:1; + + /* + * we use these for fsync or O_SYNC to decide which transaction + * needs to be committed in order for this inode to be properly + * flushed + */ unsigned int i_trans_id; + struct reiserfs_journal_list *i_jl; atomic_t openers; struct mutex tailpack; @@ -82,9 +104,10 @@ typedef enum { reiserfs_attrs_cleared = 0x00000001, } reiserfs_super_block_flags; -/* struct reiserfs_super_block accessors/mutators - * since this is a disk structure, it will always be in - * little endian format. */ +/* + * struct reiserfs_super_block accessors/mutators since this is a disk + * structure, it will always be in little endian format. + */ #define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count)) #define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v)) #define sb_free_blocks(sbp) (le32_to_cpu((sbp)->s_v1.s_free_blocks)) @@ -152,48 +175,61 @@ typedef enum { /* LOGGING -- */ -/* These all interelate for performance. -** -** If the journal block count is smaller than n transactions, you lose speed. -** I don't know what n is yet, I'm guessing 8-16. -** -** typical transaction size depends on the application, how often fsync is -** called, and how many metadata blocks you dirty in a 30 second period. -** The more small files (<16k) you use, the larger your transactions will -** be. -** -** If your journal fills faster than dirty buffers get flushed to disk, it must flush them before allowing the journal -** to wrap, which slows things down. If you need high speed meta data updates, the journal should be big enough -** to prevent wrapping before dirty meta blocks get to disk. -** -** If the batch max is smaller than the transaction max, you'll waste space at the end of the journal -** because journal_end sets the next transaction to start at 0 if the next transaction has any chance of wrapping. -** -** The large the batch max age, the better the speed, and the more meta data changes you'll lose after a crash. -** -*/ +/* + * These all interelate for performance. + * + * If the journal block count is smaller than n transactions, you lose speed. + * I don't know what n is yet, I'm guessing 8-16. + * + * typical transaction size depends on the application, how often fsync is + * called, and how many metadata blocks you dirty in a 30 second period. + * The more small files (<16k) you use, the larger your transactions will + * be. + * + * If your journal fills faster than dirty buffers get flushed to disk, it + * must flush them before allowing the journal to wrap, which slows things + * down. If you need high speed meta data updates, the journal should be + * big enough to prevent wrapping before dirty meta blocks get to disk. + * + * If the batch max is smaller than the transaction max, you'll waste space + * at the end of the journal because journal_end sets the next transaction + * to start at 0 if the next transaction has any chance of wrapping. + * + * The large the batch max age, the better the speed, and the more meta + * data changes you'll lose after a crash. + */ /* don't mess with these for a while */ - /* we have a node size define somewhere in reiserfs_fs.h. -Hans */ +/* we have a node size define somewhere in reiserfs_fs.h. -Hans */ #define JOURNAL_BLOCK_SIZE 4096 /* BUG gotta get rid of this */ #define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */ #define JOURNAL_HASH_SIZE 8192 -#define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating. Must be >= 2 */ - -/* One of these for every block in every transaction -** Each one is in two hash tables. First, a hash of the current transaction, and after journal_end, a -** hash of all the in memory transactions. -** next and prev are used by the current transaction (journal_hash). -** hnext and hprev are used by journal_list_hash. If a block is in more than one transaction, the journal_list_hash -** links it in multiple times. This allows flush_journal_list to remove just the cnode belonging -** to a given transaction. -*/ + +/* number of copies of the bitmaps to have floating. Must be >= 2 */ +#define JOURNAL_NUM_BITMAPS 5 + +/* + * One of these for every block in every transaction + * Each one is in two hash tables. First, a hash of the current transaction, + * and after journal_end, a hash of all the in memory transactions. + * next and prev are used by the current transaction (journal_hash). + * hnext and hprev are used by journal_list_hash. If a block is in more + * than one transaction, the journal_list_hash links it in multiple times. + * This allows flush_journal_list to remove just the cnode belonging to a + * given transaction. + */ struct reiserfs_journal_cnode { struct buffer_head *bh; /* real buffer head */ struct super_block *sb; /* dev of real buffer head */ - __u32 blocknr; /* block number of real buffer head, == 0 when buffer on disk */ + + /* block number of real buffer head, == 0 when buffer on disk */ + __u32 blocknr; + unsigned long state; - struct reiserfs_journal_list *jlist; /* journal list this cnode lives in */ + + /* journal list this cnode lives in */ + struct reiserfs_journal_list *jlist; + struct reiserfs_journal_cnode *next; /* next in transaction list */ struct reiserfs_journal_cnode *prev; /* prev in transaction list */ struct reiserfs_journal_cnode *hprev; /* prev in hash list */ @@ -212,18 +248,22 @@ struct reiserfs_list_bitmap { }; /* -** one of these for each transaction. The most important part here is the j_realblock. -** this list of cnodes is used to hash all the blocks in all the commits, to mark all the -** real buffer heads dirty once all the commits hit the disk, -** and to make sure every real block in a transaction is on disk before allowing the log area -** to be overwritten */ + * one of these for each transaction. The most important part here is the + * j_realblock. this list of cnodes is used to hash all the blocks in all + * the commits, to mark all the real buffer heads dirty once all the commits + * hit the disk, and to make sure every real block in a transaction is on + * disk before allowing the log area to be overwritten + */ struct reiserfs_journal_list { unsigned long j_start; unsigned long j_state; unsigned long j_len; atomic_t j_nonzerolen; atomic_t j_commit_left; - atomic_t j_older_commits_done; /* all commits older than this on disk */ + + /* all commits older than this on disk */ + atomic_t j_older_commits_done; + struct mutex j_commit_mutex; unsigned int j_trans_id; time_t j_timestamp; @@ -234,11 +274,15 @@ struct reiserfs_journal_list { /* time ordered list of all active transactions */ struct list_head j_list; - /* time ordered list of all transactions we haven't tried to flush yet */ + /* + * time ordered list of all transactions we haven't tried + * to flush yet + */ struct list_head j_working_list; /* list of tail conversion targets in need of flush before commit */ struct list_head j_tail_bh_list; + /* list of data=ordered buffers in need of flush before commit */ struct list_head j_bh_list; int j_refcount; @@ -246,46 +290,83 @@ struct reiserfs_journal_list { struct reiserfs_journal { struct buffer_head **j_ap_blocks; /* journal blocks on disk */ - struct reiserfs_journal_cnode *j_last; /* newest journal block */ - struct reiserfs_journal_cnode *j_first; /* oldest journal block. start here for traverse */ + /* newest journal block */ + struct reiserfs_journal_cnode *j_last; + + /* oldest journal block. start here for traverse */ + struct reiserfs_journal_cnode *j_first; struct block_device *j_dev_bd; fmode_t j_dev_mode; - int j_1st_reserved_block; /* first block on s_dev of reserved area journal */ + + /* first block on s_dev of reserved area journal */ + int j_1st_reserved_block; unsigned long j_state; unsigned int j_trans_id; unsigned long j_mount_id; - unsigned long j_start; /* start of current waiting commit (index into j_ap_blocks) */ + + /* start of current waiting commit (index into j_ap_blocks) */ + unsigned long j_start; unsigned long j_len; /* length of current waiting commit */ - unsigned long j_len_alloc; /* number of buffers requested by journal_begin() */ + + /* number of buffers requested by journal_begin() */ + unsigned long j_len_alloc; + atomic_t j_wcount; /* count of writers for current commit */ - unsigned long j_bcount; /* batch count. allows turning X transactions into 1 */ - unsigned long j_first_unflushed_offset; /* first unflushed transactions offset */ - unsigned j_last_flush_trans_id; /* last fully flushed journal timestamp */ + + /* batch count. allows turning X transactions into 1 */ + unsigned long j_bcount; + + /* first unflushed transactions offset */ + unsigned long j_first_unflushed_offset; + + /* last fully flushed journal timestamp */ + unsigned j_last_flush_trans_id; + struct buffer_head *j_header_bh; time_t j_trans_start_time; /* time this transaction started */ struct mutex j_mutex; struct mutex j_flush_mutex; - wait_queue_head_t j_join_wait; /* wait for current transaction to finish before starting new one */ - atomic_t j_jlock; /* lock for j_join_wait */ + + /* wait for current transaction to finish before starting new one */ + wait_queue_head_t j_join_wait; + + atomic_t j_jlock; /* lock for j_join_wait */ int j_list_bitmap_index; /* number of next list bitmap to use */ - int j_must_wait; /* no more journal begins allowed. MUST sleep on j_join_wait */ - int j_next_full_flush; /* next journal_end will flush all journal list */ - int j_next_async_flush; /* next journal_end will flush all async commits */ + + /* no more journal begins allowed. MUST sleep on j_join_wait */ + int j_must_wait; + + /* next journal_end will flush all journal list */ + int j_next_full_flush; + + /* next journal_end will flush all async commits */ + int j_next_async_flush; int j_cnode_used; /* number of cnodes on the used list */ int j_cnode_free; /* number of cnodes on the free list */ - unsigned int j_trans_max; /* max number of blocks in a transaction. */ - unsigned int j_max_batch; /* max number of blocks to batch into a trans */ - unsigned int j_max_commit_age; /* in seconds, how old can an async commit be */ - unsigned int j_max_trans_age; /* in seconds, how old can a transaction be */ - unsigned int j_default_max_commit_age; /* the default for the max commit age */ + /* max number of blocks in a transaction. */ + unsigned int j_trans_max; + + /* max number of blocks to batch into a trans */ + unsigned int j_max_batch; + + /* in seconds, how old can an async commit be */ + unsigned int j_max_commit_age; + + /* in seconds, how old can a transaction be */ + unsigned int j_max_trans_age; + + /* the default for the max commit age */ + unsigned int j_default_max_commit_age; struct reiserfs_journal_cnode *j_cnode_free_list; - struct reiserfs_journal_cnode *j_cnode_free_orig; /* orig pointer returned from vmalloc */ + + /* orig pointer returned from vmalloc */ + struct reiserfs_journal_cnode *j_cnode_free_orig; struct reiserfs_journal_list *j_current_jl; int j_free_bitmap_nodes; @@ -306,14 +387,21 @@ struct reiserfs_journal { /* list of all active transactions */ struct list_head j_journal_list; + /* lists that haven't been touched by writeback attempts */ struct list_head j_working_list; - struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS]; /* array of bitmaps to record the deleted blocks */ - struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE]; /* hash table for real buffer heads in current trans */ - struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; /* hash table for all the real buffer heads in all - the transactions */ - struct list_head j_prealloc_list; /* list of inodes which have preallocated blocks */ + /* hash table for real buffer heads in current trans */ + struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE]; + + /* hash table for all the real buffer heads in all the transactions */ + struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; + + /* array of bitmaps to record the deleted blocks */ + struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS]; + + /* list of inodes which have preallocated blocks */ + struct list_head j_prealloc_list; int j_persistent_trans; unsigned long j_max_trans_size; unsigned long j_max_batch_size; @@ -328,11 +416,12 @@ struct reiserfs_journal { enum journal_state_bits { J_WRITERS_BLOCKED = 1, /* set when new writers not allowed */ - J_WRITERS_QUEUED, /* set when log is full due to too many writers */ - J_ABORTED, /* set when log is aborted */ + J_WRITERS_QUEUED, /* set when log is full due to too many writers */ + J_ABORTED, /* set when log is aborted */ }; -#define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick. magic string to find desc blocks in the journal */ +/* ick. magic string to find desc blocks in the journal */ +#define JOURNAL_DESC_MAGIC "ReIsErLB" typedef __u32(*hashf_t) (const signed char *, int); @@ -364,7 +453,10 @@ typedef struct reiserfs_proc_info_data { stat_cnt_t leaked_oid; stat_cnt_t leaves_removable; - /* balances per level. Use explicit 5 as MAX_HEIGHT is not visible yet. */ + /* + * balances per level. + * Use explicit 5 as MAX_HEIGHT is not visible yet. + */ stat_cnt_t balance_at[5]; /* XXX */ /* sbk == search_by_key */ stat_cnt_t sbk_read_at[5]; /* XXX */ @@ -416,47 +508,75 @@ typedef struct reiserfs_proc_info_data { /* reiserfs union of in-core super block data */ struct reiserfs_sb_info { - struct buffer_head *s_sbh; /* Buffer containing the super block */ - /* both the comment and the choice of - name are unclear for s_rs -Hans */ - struct reiserfs_super_block *s_rs; /* Pointer to the super block in the buffer */ + /* Buffer containing the super block */ + struct buffer_head *s_sbh; + + /* Pointer to the on-disk super block in the buffer */ + struct reiserfs_super_block *s_rs; struct reiserfs_bitmap_info *s_ap_bitmap; - struct reiserfs_journal *s_journal; /* pointer to journal information */ + + /* pointer to journal information */ + struct reiserfs_journal *s_journal; + unsigned short s_mount_state; /* reiserfs state (valid, invalid) */ /* Serialize writers access, replace the old bkl */ struct mutex lock; + /* Owner of the lock (can be recursive) */ struct task_struct *lock_owner; + /* Depth of the lock, start from -1 like the bkl */ int lock_depth; + struct workqueue_struct *commit_wq; + /* Comment? -Hans */ void (*end_io_handler) (struct buffer_head *, int); - hashf_t s_hash_function; /* pointer to function which is used - to sort names in directory. Set on - mount */ - unsigned long s_mount_opt; /* reiserfs's mount options are set - here (currently - NOTAIL, NOLOG, - REPLAYONLY) */ - - struct { /* This is a structure that describes block allocator options */ - unsigned long bits; /* Bitfield for enable/disable kind of options */ - unsigned long large_file_size; /* size started from which we consider file to be a large one(in blocks) */ + + /* + * pointer to function which is used to sort names in directory. + * Set on mount + */ + hashf_t s_hash_function; + + /* reiserfs's mount options are set here */ + unsigned long s_mount_opt; + + /* This is a structure that describes block allocator options */ + struct { + /* Bitfield for enable/disable kind of options */ + unsigned long bits; + + /* + * size started from which we consider file + * to be a large one (in blocks) + */ + unsigned long large_file_size; + int border; /* percentage of disk, border takes */ - int preallocmin; /* Minimal file size (in blocks) starting from which we do preallocations */ - int preallocsize; /* Number of blocks we try to prealloc when file - reaches preallocmin size (in blocks) or - prealloc_list is empty. */ + + /* + * Minimal file size (in blocks) starting + * from which we do preallocations + */ + int preallocmin; + + /* + * Number of blocks we try to prealloc when file + * reaches preallocmin size (in blocks) or prealloc_list + is empty. + */ + int preallocsize; } s_alloc_options; /* Comment? -Hans */ wait_queue_head_t s_wait; - /* To be obsoleted soon by per buffer seals.. -Hans */ - atomic_t s_generation_counter; // increased by one every time the - // tree gets re-balanced - unsigned long s_properties; /* File system properties. Currently holds - on-disk FS format */ + /* increased by one every time the tree gets re-balanced */ + atomic_t s_generation_counter; + + /* File system properties. Currently holds on-disk FS format */ + unsigned long s_properties; /* session statistics */ int s_disk_reads; @@ -469,14 +589,23 @@ struct reiserfs_sb_info { int s_bmaps_without_search; int s_direct2indirect; int s_indirect2direct; - /* set up when it's ok for reiserfs_read_inode2() to read from - disk inode with nlink==0. Currently this is only used during - finish_unfinished() processing at mount time */ + + /* + * set up when it's ok for reiserfs_read_inode2() to read from + * disk inode with nlink==0. Currently this is only used during + * finish_unfinished() processing at mount time + */ int s_is_unlinked_ok; + reiserfs_proc_info_data_t s_proc_info_data; struct proc_dir_entry *procdir; - int reserved_blocks; /* amount of blocks reserved for further allocations */ - spinlock_t bitmap_lock; /* this lock on now only used to protect reserved_blocks variable */ + + /* amount of blocks reserved for further allocations */ + int reserved_blocks; + + + /* this lock on now only used to protect reserved_blocks variable */ + spinlock_t bitmap_lock; struct dentry *priv_root; /* root of /.reiserfs_priv */ struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */ int j_errno; @@ -492,14 +621,13 @@ struct reiserfs_sb_info { char *s_jdev; /* Stored jdev for mount option showing */ #ifdef CONFIG_REISERFS_CHECK - struct tree_balance *cur_tb; /* - * Detects whether more than one - * copy of tb exists per superblock - * as a means of checking whether - * do_balance is executing concurrently - * against another tree reader/writer - * on a same mount point. - */ + /* + * Detects whether more than one copy of tb exists per superblock + * as a means of checking whether do_balance is executing + * concurrently against another tree reader/writer on a same + * mount point. + */ + struct tree_balance *cur_tb; #endif }; @@ -508,25 +636,36 @@ struct reiserfs_sb_info { #define REISERFS_3_6 1 #define REISERFS_OLD_FORMAT 2 -enum reiserfs_mount_options { /* Mount options */ - REISERFS_LARGETAIL, /* large tails will be created in a session */ - REISERFS_SMALLTAIL, /* small (for files less than block size) tails will be created in a session */ - REPLAYONLY, /* replay journal and return 0. Use by fsck */ - REISERFS_CONVERT, /* -o conv: causes conversion of old - format super block to the new - format. If not specified - old - partition will be dealt with in a - manner of 3.5.x */ - -/* -o hash={tea, rupasov, r5, detect} is meant for properly mounting -** reiserfs disks from 3.5.19 or earlier. 99% of the time, this option -** is not required. If the normal autodection code can't determine which -** hash to use (because both hashes had the same value for a file) -** use this option to force a specific hash. It won't allow you to override -** the existing hash on the FS, so if you have a tea hash disk, and mount -** with -o hash=rupasov, the mount will fail. -*/ +enum reiserfs_mount_options { + /* large tails will be created in a session */ + REISERFS_LARGETAIL, + /* + * small (for files less than block size) tails will + * be created in a session + */ + REISERFS_SMALLTAIL, + + /* replay journal and return 0. Use by fsck */ + REPLAYONLY, + + /* + * -o conv: causes conversion of old format super block to the + * new format. If not specified - old partition will be dealt + * with in a manner of 3.5.x + */ + REISERFS_CONVERT, + + /* + * -o hash={tea, rupasov, r5, detect} is meant for properly mounting + * reiserfs disks from 3.5.19 or earlier. 99% of the time, this + * option is not required. If the normal autodection code can't + * determine which hash to use (because both hashes had the same + * value for a file) use this option to force a specific hash. + * It won't allow you to override the existing hash on the FS, so + * if you have a tea hash disk, and mount with -o hash=rupasov, + * the mount will fail. + */ FORCE_TEA_HASH, /* try to force tea hash on mount */ FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */ FORCE_R5_HASH, /* try to force rupasov hash on mount */ @@ -536,9 +675,11 @@ enum reiserfs_mount_options { REISERFS_DATA_ORDERED, REISERFS_DATA_WRITEBACK, -/* used for testing experimental features, makes benchmarking new - features with and without more convenient, should never be used by - users in any code shipped to users (ideally) */ + /* + * used for testing experimental features, makes benchmarking new + * features with and without more convenient, should never be used by + * users in any code shipped to users (ideally) + */ REISERFS_NO_BORDER, REISERFS_NO_UNHASHED_RELOCATION, @@ -608,14 +749,6 @@ int reiserfs_resize(struct super_block *, unsigned long); #define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->) -/* A safe version of the "bdevname", which returns the "s_id" field of - * a superblock or else "Null superblock" if the super block is NULL. - */ -static inline char *reiserfs_bdevname(struct super_block *s) -{ - return (s == NULL) ? "Null superblock" : s->s_id; -} - #define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal))) static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal *journal) @@ -713,28 +846,28 @@ static inline void reiserfs_cond_resched(struct super_block *s) struct fid; -/* in reading the #defines, it may help to understand that they employ - the following abbreviations: - - B = Buffer - I = Item header - H = Height within the tree (should be changed to LEV) - N = Number of the item in the node - STAT = stat data - DEH = Directory Entry Header - EC = Entry Count - E = Entry number - UL = Unsigned Long - BLKH = BLocK Header - UNFM = UNForMatted node - DC = Disk Child - P = Path - - These #defines are named by concatenating these abbreviations, - where first comes the arguments, and last comes the return value, - of the macro. - -*/ +/* + * in reading the #defines, it may help to understand that they employ + * the following abbreviations: + * + * B = Buffer + * I = Item header + * H = Height within the tree (should be changed to LEV) + * N = Number of the item in the node + * STAT = stat data + * DEH = Directory Entry Header + * EC = Entry Count + * E = Entry number + * UL = Unsigned Long + * BLKH = BLocK Header + * UNFM = UNForMatted node + * DC = Disk Child + * P = Path + * + * These #defines are named by concatenating these abbreviations, + * where first comes the arguments, and last comes the return value, + * of the macro. + */ #define USE_INODE_GENERATION_COUNTER @@ -745,14 +878,17 @@ struct fid; /* n must be power of 2 */ #define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u)) -// to be ok for alpha and others we have to align structures to 8 byte -// boundary. -// FIXME: do not change 4 by anything else: there is code which relies on that +/* + * to be ok for alpha and others we have to align structures to 8 byte + * boundary. + * FIXME: do not change 4 by anything else: there is code which relies on that + */ #define ROUND_UP(x) _ROUND_UP(x,8LL) -/* debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug -** messages. -*/ +/* + * debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug + * messages. + */ #define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */ void __reiserfs_warning(struct super_block *s, const char *id, @@ -761,7 +897,7 @@ void __reiserfs_warning(struct super_block *s, const char *id, __reiserfs_warning(s, id, __func__, fmt, ##args) /* assertions handling */ -/** always check a condition and panic if it's false. */ +/* always check a condition and panic if it's false. */ #define __RASSERT(cond, scond, format, args...) \ do { \ if (!(cond)) \ @@ -784,35 +920,48 @@ do { \ * Disk Data Structures */ -/***************************************************************************/ -/* SUPER BLOCK */ -/***************************************************************************/ +/*************************************************************************** + * SUPER BLOCK * + ***************************************************************************/ /* - * Structure of super block on disk, a version of which in RAM is often accessed as REISERFS_SB(s)->s_rs - * the version in RAM is part of a larger structure containing fields never written to disk. + * Structure of super block on disk, a version of which in RAM is often + * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger + * structure containing fields never written to disk. */ -#define UNSET_HASH 0 // read_super will guess about, what hash names - // in directories were sorted with +#define UNSET_HASH 0 /* Detect hash on disk */ #define TEA_HASH 1 #define YURA_HASH 2 #define R5_HASH 3 #define DEFAULT_HASH R5_HASH struct journal_params { - __le32 jp_journal_1st_block; /* where does journal start from on its - * device */ - __le32 jp_journal_dev; /* journal device st_rdev */ - __le32 jp_journal_size; /* size of the journal */ - __le32 jp_journal_trans_max; /* max number of blocks in a transaction. */ - __le32 jp_journal_magic; /* random value made on fs creation (this - * was sb_journal_block_count) */ - __le32 jp_journal_max_batch; /* max number of blocks to batch into a - * trans */ - __le32 jp_journal_max_commit_age; /* in seconds, how old can an async - * commit be */ - __le32 jp_journal_max_trans_age; /* in seconds, how old can a transaction - * be */ + /* where does journal start from on its * device */ + __le32 jp_journal_1st_block; + + /* journal device st_rdev */ + __le32 jp_journal_dev; + + /* size of the journal */ + __le32 jp_journal_size; + + /* max number of blocks in a transaction. */ + __le32 jp_journal_trans_max; + + /* + * random value made on fs creation + * (this was sb_journal_block_count) + */ + __le32 jp_journal_magic; + + /* max number of blocks to batch into a trans */ + __le32 jp_journal_max_batch; + + /* in seconds, how old can an async commit be */ + __le32 jp_journal_max_commit_age; + + /* in seconds, how old can a transaction be */ + __le32 jp_journal_max_trans_age; }; /* this is the super from 3.5.X, where X >= 10 */ @@ -822,26 +971,48 @@ struct reiserfs_super_block_v1 { __le32 s_root_block; /* root block number */ struct journal_params s_journal; __le16 s_blocksize; /* block size */ - __le16 s_oid_maxsize; /* max size of object id array, see - * get_objectid() commentary */ + + /* max size of object id array, see get_objectid() commentary */ + __le16 s_oid_maxsize; __le16 s_oid_cursize; /* current size of object id array */ - __le16 s_umount_state; /* this is set to 1 when filesystem was - * umounted, to 2 - when not */ - char s_magic[10]; /* reiserfs magic string indicates that - * file system is reiserfs: - * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" */ - __le16 s_fs_state; /* it is set to used by fsck to mark which - * phase of rebuilding is done */ - __le32 s_hash_function_code; /* indicate, what hash function is being use - * to sort names in a directory*/ + + /* this is set to 1 when filesystem was umounted, to 2 - when not */ + __le16 s_umount_state; + + /* + * reiserfs magic string indicates that file system is reiserfs: + * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" + */ + char s_magic[10]; + + /* + * it is set to used by fsck to mark which + * phase of rebuilding is done + */ + __le16 s_fs_state; + /* + * indicate, what hash function is being use + * to sort names in a directory + */ + __le32 s_hash_function_code; __le16 s_tree_height; /* height of disk tree */ - __le16 s_bmap_nr; /* amount of bitmap blocks needed to address - * each block of file system */ - __le16 s_version; /* this field is only reliable on filesystem - * with non-standard journal */ - __le16 s_reserved_for_journal; /* size in blocks of journal area on main - * device, we need to keep after - * making fs with non-standard journal */ + + /* + * amount of bitmap blocks needed to address + * each block of file system + */ + __le16 s_bmap_nr; + + /* + * this field is only reliable on filesystem with non-standard journal + */ + __le16 s_version; + + /* + * size in blocks of journal area on main device, we need to + * keep after making fs with non-standard journal + */ + __le16 s_reserved_for_journal; } __attribute__ ((__packed__)); #define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1)) @@ -850,17 +1021,21 @@ struct reiserfs_super_block_v1 { struct reiserfs_super_block { struct reiserfs_super_block_v1 s_v1; __le32 s_inode_generation; - __le32 s_flags; /* Right now used only by inode-attributes, if enabled */ + + /* Right now used only by inode-attributes, if enabled */ + __le32 s_flags; + unsigned char s_uuid[16]; /* filesystem unique identifier */ unsigned char s_label[16]; /* filesystem volume label */ __le16 s_mnt_count; /* Count of mounts since last fsck */ __le16 s_max_mnt_count; /* Maximum mounts before check */ __le32 s_lastcheck; /* Timestamp of last fsck */ __le32 s_check_interval; /* Interval between checks */ - char s_unused[76]; /* zero filled by mkreiserfs and - * reiserfs_convert_objectid_map_v1() - * so any additions must be updated - * there as well. */ + + /* + * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1() + * so any additions must be updated there as well. */ + char s_unused[76]; } __attribute__ ((__packed__)); #define SB_SIZE (sizeof(struct reiserfs_super_block)) @@ -868,7 +1043,7 @@ struct reiserfs_super_block { #define REISERFS_VERSION_1 0 #define REISERFS_VERSION_2 2 -// on-disk super block fields converted to cpu form +/* on-disk super block fields converted to cpu form */ #define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs) #define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1)) #define SB_BLOCKSIZE(s) \ @@ -923,11 +1098,13 @@ int is_reiserfs_3_5(struct reiserfs_super_block *rs); int is_reiserfs_3_6(struct reiserfs_super_block *rs); int is_reiserfs_jr(struct reiserfs_super_block *rs); -/* ReiserFS leaves the first 64k unused, so that partition labels have - enough space. If someone wants to write a fancy bootloader that - needs more than 64k, let us know, and this will be increased in size. - This number must be larger than than the largest block size on any - platform, or code will break. -Hans */ +/* + * ReiserFS leaves the first 64k unused, so that partition labels have + * enough space. If someone wants to write a fancy bootloader that + * needs more than 64k, let us know, and this will be increased in size. + * This number must be larger than than the largest block size on any + * platform, or code will break. -Hans + */ #define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024) #define REISERFS_FIRST_BLOCK unused_define #define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES @@ -952,8 +1129,7 @@ struct unfm_nodeinfo { unsigned short unfm_freespace; }; -/* there are two formats of keys: 3.5 and 3.6 - */ +/* there are two formats of keys: 3.5 and 3.6 */ #define KEY_FORMAT_3_5 0 #define KEY_FORMAT_3_6 1 @@ -971,8 +1147,10 @@ static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb) return sb->s_fs_info; } -/* Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16 - * which overflows on large file systems. */ +/* + * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16 + * which overflows on large file systems. + */ static inline __u32 reiserfs_bmap_count(struct super_block *sb) { return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1; @@ -983,8 +1161,10 @@ static inline int bmap_would_wrap(unsigned bmap_nr) return bmap_nr > ((1LL << 16) - 1); } -/** this says about version of key of all items (but stat data) the - object consists of */ +/* + * this says about version of key of all items (but stat data) the + * object consists of + */ #define get_inode_item_key_version( inode ) \ ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5) @@ -1003,16 +1183,18 @@ static inline int bmap_would_wrap(unsigned bmap_nr) else \ REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; }) -/* This is an aggressive tail suppression policy, I am hoping it - improves our benchmarks. The principle behind it is that percentage - space saving is what matters, not absolute space saving. This is - non-intuitive, but it helps to understand it if you consider that the - cost to access 4 blocks is not much more than the cost to access 1 - block, if you have to do a seek and rotate. A tail risks a - non-linear disk access that is significant as a percentage of total - time cost for a 4 block file and saves an amount of space that is - less significant as a percentage of space, or so goes the hypothesis. - -Hans */ +/* + * This is an aggressive tail suppression policy, I am hoping it + * improves our benchmarks. The principle behind it is that percentage + * space saving is what matters, not absolute space saving. This is + * non-intuitive, but it helps to understand it if you consider that the + * cost to access 4 blocks is not much more than the cost to access 1 + * block, if you have to do a seek and rotate. A tail risks a + * non-linear disk access that is significant as a percentage of total + * time cost for a 4 block file and saves an amount of space that is + * less significant as a percentage of space, or so goes the hypothesis. + * -Hans + */ #define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \ (\ (!(n_tail_size)) || \ @@ -1026,10 +1208,11 @@ static inline int bmap_would_wrap(unsigned bmap_nr) ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \ ) -/* Another strategy for tails, this one means only create a tail if all the - file would fit into one DIRECT item. - Primary intention for this one is to increase performance by decreasing - seeking. +/* + * Another strategy for tails, this one means only create a tail if all the + * file would fit into one DIRECT item. + * Primary intention for this one is to increase performance by decreasing + * seeking. */ #define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \ (\ @@ -1043,23 +1226,21 @@ static inline int bmap_would_wrap(unsigned bmap_nr) #define REISERFS_VALID_FS 1 #define REISERFS_ERROR_FS 2 -// -// there are 5 item types currently -// +/* + * there are 5 item types currently + */ #define TYPE_STAT_DATA 0 #define TYPE_INDIRECT 1 #define TYPE_DIRECT 2 #define TYPE_DIRENTRY 3 #define TYPE_MAXTYPE 3 -#define TYPE_ANY 15 // FIXME: comment is required +#define TYPE_ANY 15 /* FIXME: comment is required */ -/***************************************************************************/ -/* KEY & ITEM HEAD */ -/***************************************************************************/ +/*************************************************************************** + * KEY & ITEM HEAD * + ***************************************************************************/ -// -// directories use this key as well as old files -// +/* * directories use this key as well as old files */ struct offset_v1 { __le32 k_offset; __le32 k_uniqueness; @@ -1092,11 +1273,14 @@ static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset) v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset); } -/* Key of an item determines its location in the S+tree, and - is composed of 4 components */ +/* + * Key of an item determines its location in the S+tree, and + * is composed of 4 components + */ struct reiserfs_key { - __le32 k_dir_id; /* packing locality: by default parent - directory object id */ + /* packing locality: by default parent directory object id */ + __le32 k_dir_id; + __le32 k_objectid; /* object identifier */ union { struct offset_v1 k_offset_v1; @@ -1105,8 +1289,8 @@ struct reiserfs_key { } __attribute__ ((__packed__)); struct in_core_key { - __u32 k_dir_id; /* packing locality: by default parent - directory object id */ + /* packing locality: by default parent directory object id */ + __u32 k_dir_id; __u32 k_objectid; /* object identifier */ __u64 k_offset; __u8 k_type; @@ -1115,14 +1299,16 @@ struct in_core_key { struct cpu_key { struct in_core_key on_disk_key; int version; - int key_length; /* 3 in all cases but direct2indirect and - indirect2direct conversion */ + /* 3 in all cases but direct2indirect and indirect2direct conversion */ + int key_length; }; -/* Our function for comparing keys can compare keys of different - lengths. It takes as a parameter the length of the keys it is to - compare. These defines are used in determining what is to be passed - to it as that parameter. */ +/* + * Our function for comparing keys can compare keys of different + * lengths. It takes as a parameter the length of the keys it is to + * compare. These defines are used in determining what is to be passed + * to it as that parameter. + */ #define REISERFS_FULL_KEY_LEN 4 #define REISERFS_SHORT_KEY_LEN 2 @@ -1151,40 +1337,52 @@ struct cpu_key { #define POSITION_FOUND 1 #define POSITION_NOT_FOUND 0 -// return values for reiserfs_find_entry and search_by_entry_key +/* return values for reiserfs_find_entry and search_by_entry_key */ #define NAME_FOUND 1 #define NAME_NOT_FOUND 0 #define GOTO_PREVIOUS_ITEM 2 #define NAME_FOUND_INVISIBLE 3 -/* Everything in the filesystem is stored as a set of items. The - item head contains the key of the item, its free space (for - indirect items) and specifies the location of the item itself - within the block. */ +/* + * Everything in the filesystem is stored as a set of items. The + * item head contains the key of the item, its free space (for + * indirect items) and specifies the location of the item itself + * within the block. + */ struct item_head { - /* Everything in the tree is found by searching for it based on - * its key.*/ + /* + * Everything in the tree is found by searching for it based on + * its key. + */ struct reiserfs_key ih_key; union { - /* The free space in the last unformatted node of an - indirect item if this is an indirect item. This - equals 0xFFFF iff this is a direct item or stat data - item. Note that the key, not this field, is used to - determine the item type, and thus which field this - union contains. */ + /* + * The free space in the last unformatted node of an + * indirect item if this is an indirect item. This + * equals 0xFFFF iff this is a direct item or stat data + * item. Note that the key, not this field, is used to + * determine the item type, and thus which field this + * union contains. + */ __le16 ih_free_space_reserved; - /* Iff this is a directory item, this field equals the - number of directory entries in the directory item. */ + + /* + * Iff this is a directory item, this field equals the + * number of directory entries in the directory item. + */ __le16 ih_entry_count; } __attribute__ ((__packed__)) u; __le16 ih_item_len; /* total size of the item body */ - __le16 ih_item_location; /* an offset to the item body - * within the block */ - __le16 ih_version; /* 0 for all old items, 2 for new - ones. Highest bit is set by fsck - temporary, cleaned after all - done */ + + /* an offset to the item body within the block */ + __le16 ih_item_location; + + /* + * 0 for all old items, 2 for new ones. Highest bit is set by fsck + * temporary, cleaned after all done + */ + __le16 ih_version; } __attribute__ ((__packed__)); /* size of item header */ #define IH_SIZE (sizeof(struct item_head)) @@ -1206,27 +1404,24 @@ struct item_head { #define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih)) #define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val))) -/* these operate on indirect items, where you've got an array of ints -** at a possibly unaligned location. These are a noop on ia32 -** -** p is the array of __u32, i is the index into the array, v is the value -** to store there. -*/ +/* + * these operate on indirect items, where you've got an array of ints + * at a possibly unaligned location. These are a noop on ia32 + * + * p is the array of __u32, i is the index into the array, v is the value + * to store there. + */ #define get_block_num(p, i) get_unaligned_le32((p) + (i)) #define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i)) -// -// in old version uniqueness field shows key type -// +/* * in old version uniqueness field shows key type */ #define V1_SD_UNIQUENESS 0 #define V1_INDIRECT_UNIQUENESS 0xfffffffe #define V1_DIRECT_UNIQUENESS 0xffffffff #define V1_DIRENTRY_UNIQUENESS 500 -#define V1_ANY_UNIQUENESS 555 // FIXME: comment is required +#define V1_ANY_UNIQUENESS 555 /* FIXME: comment is required */ -// -// here are conversion routines -// +/* here are conversion routines */ static inline int uniqueness2type(__u32 uniqueness) CONSTF; static inline int uniqueness2type(__u32 uniqueness) { @@ -1263,11 +1458,11 @@ static inline __u32 type2uniqueness(int type) } } -// -// key is pointer to on disk key which is stored in le, result is cpu, -// there is no way to get version of object from key, so, provide -// version to these defines -// +/* + * key is pointer to on disk key which is stored in le, result is cpu, + * there is no way to get version of object from key, so, provide + * version to these defines + */ static inline loff_t le_key_k_offset(int version, const struct reiserfs_key *key) { @@ -1283,9 +1478,11 @@ static inline loff_t le_ih_k_offset(const struct item_head *ih) static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key) { - return (version == KEY_FORMAT_3_5) ? - uniqueness2type(le32_to_cpu(key->u.k_offset_v1.k_uniqueness)) : - offset_v2_k_type(&(key->u.k_offset_v2)); + if (version == KEY_FORMAT_3_5) { + loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness); + return uniqueness2type(val); + } else + return offset_v2_k_type(&(key->u.k_offset_v2)); } static inline loff_t le_ih_k_type(const struct item_head *ih) @@ -1296,8 +1493,22 @@ static inline loff_t le_ih_k_type(const struct item_head *ih) static inline void set_le_key_k_offset(int version, struct reiserfs_key *key, loff_t offset) { - (version == KEY_FORMAT_3_5) ? (void)(key->u.k_offset_v1.k_offset = cpu_to_le32(offset)) : /* jdm check */ - (void)(set_offset_v2_k_offset(&(key->u.k_offset_v2), offset)); + if (version == KEY_FORMAT_3_5) + key->u.k_offset_v1.k_offset = cpu_to_le32(offset); + else + set_offset_v2_k_offset(&key->u.k_offset_v2, offset); +} + +static inline void add_le_key_k_offset(int version, struct reiserfs_key *key, + loff_t offset) +{ + set_le_key_k_offset(version, key, + le_key_k_offset(version, key) + offset); +} + +static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset) +{ + add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset); } static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset) @@ -1308,10 +1519,11 @@ static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset) static inline void set_le_key_k_type(int version, struct reiserfs_key *key, int type) { - (version == KEY_FORMAT_3_5) ? - (void)(key->u.k_offset_v1.k_uniqueness = - cpu_to_le32(type2uniqueness(type))) - : (void)(set_offset_v2_k_type(&(key->u.k_offset_v2), type)); + if (version == KEY_FORMAT_3_5) { + type = type2uniqueness(type); + key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type); + } else + set_offset_v2_k_type(&key->u.k_offset_v2, type); } static inline void set_le_ih_k_type(struct item_head *ih, int type) @@ -1339,9 +1551,7 @@ static inline int is_statdata_le_key(int version, struct reiserfs_key *key) return le_key_k_type(version, key) == TYPE_STAT_DATA; } -// -// item header has version. -// +/* item header has version. */ static inline int is_direntry_le_ih(struct item_head *ih) { return is_direntry_le_key(ih_version(ih), &ih->ih_key); @@ -1362,9 +1572,7 @@ static inline int is_statdata_le_ih(struct item_head *ih) return is_statdata_le_key(ih_version(ih), &ih->ih_key); } -// -// key is pointer to cpu key, result is cpu -// +/* key is pointer to cpu key, result is cpu */ static inline loff_t cpu_key_k_offset(const struct cpu_key *key) { return key->on_disk_key.k_offset; @@ -1415,7 +1623,7 @@ static inline void cpu_key_k_offset_dec(struct cpu_key *key) extern struct reiserfs_key root_key; -/* +/* * Picture represents a leaf of the S+tree * ______________________________________________________ * | | Array of | | | @@ -1424,15 +1632,19 @@ extern struct reiserfs_key root_key; * |______|_______________|___________________|___________| */ -/* Header of a disk block. More precisely, header of a formatted leaf - or internal node, and not the header of an unformatted node. */ +/* + * Header of a disk block. More precisely, header of a formatted leaf + * or internal node, and not the header of an unformatted node. + */ struct block_head { __le16 blk_level; /* Level of a block in the tree. */ __le16 blk_nr_item; /* Number of keys/items in a block. */ __le16 blk_free_space; /* Block free space in bytes. */ __le16 blk_reserved; /* dump this in v4/planA */ - struct reiserfs_key blk_right_delim_key; /* kept only for compatibility */ + + /* kept only for compatibility */ + struct reiserfs_key blk_right_delim_key; }; #define BLKH_SIZE (sizeof(struct block_head)) @@ -1447,18 +1659,20 @@ struct block_head { #define blkh_right_delim_key(p_blkh) ((p_blkh)->blk_right_delim_key) #define set_blkh_right_delim_key(p_blkh,val) ((p_blkh)->blk_right_delim_key = val) +/* values for blk_level field of the struct block_head */ + /* - * values for blk_level field of the struct block_head + * When node gets removed from the tree its blk_level is set to FREE_LEVEL. + * It is then used to see whether the node is still in the tree */ - -#define FREE_LEVEL 0 /* when node gets removed from the tree its - blk_level is set to FREE_LEVEL. It is then - used to see whether the node is still in the - tree */ +#define FREE_LEVEL 0 #define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */ -/* Given the buffer head of a formatted node, resolve to the block head of that node. */ +/* + * Given the buffer head of a formatted node, resolve to the + * block head of that node. + */ #define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data)) /* Number of items that are in buffer. */ #define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh))) @@ -1479,14 +1693,14 @@ struct block_head { #define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \ && B_LEVEL(bh) <= MAX_HEIGHT) -/***************************************************************************/ -/* STAT DATA */ -/***************************************************************************/ +/*************************************************************************** + * STAT DATA * + ***************************************************************************/ -// -// old stat data is 32 bytes long. We are going to distinguish new one by -// different size -// +/* + * old stat data is 32 bytes long. We are going to distinguish new one by + * different size +*/ struct stat_data_v1 { __le16 sd_mode; /* file type, permissions */ __le16 sd_nlink; /* number of hard links */ @@ -1495,20 +1709,25 @@ struct stat_data_v1 { __le32 sd_size; /* file size */ __le32 sd_atime; /* time of last access */ __le32 sd_mtime; /* time file was last modified */ - __le32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */ + + /* + * time inode (stat data) was last changed + * (except changes to sd_atime and sd_mtime) + */ + __le32 sd_ctime; union { __le32 sd_rdev; __le32 sd_blocks; /* number of blocks file uses */ } __attribute__ ((__packed__)) u; - __le32 sd_first_direct_byte; /* first byte of file which is stored - in a direct item: except that if it - equals 1 it is a symlink and if it - equals ~(__u32)0 there is no - direct item. The existence of this - field really grates on me. Let's - replace it with a macro based on - sd_size and our tail suppression - policy. Someday. -Hans */ + + /* + * first byte of file which is stored in a direct item: except that if + * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no + * direct item. The existence of this field really grates on me. + * Let's replace it with a macro based on sd_size and our tail + * suppression policy. Someday. -Hans + */ + __le32 sd_first_direct_byte; } __attribute__ ((__packed__)); #define SD_V1_SIZE (sizeof(struct stat_data_v1)) @@ -1540,8 +1759,10 @@ struct stat_data_v1 { /* inode flags stored in sd_attrs (nee sd_reserved) */ -/* we want common flags to have the same values as in ext2, - so chattr(1) will work without problems */ +/* + * we want common flags to have the same values as in ext2, + * so chattr(1) will work without problems + */ #define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL #define REISERFS_APPEND_FL FS_APPEND_FL #define REISERFS_SYNC_FL FS_SYNC_FL @@ -1561,8 +1782,10 @@ struct stat_data_v1 { REISERFS_COMPR_FL | \ REISERFS_NOTAIL_FL ) -/* Stat Data on disk (reiserfs version of UFS disk inode minus the - address blocks) */ +/* + * Stat Data on disk (reiserfs version of UFS disk inode minus the + * address blocks) + */ struct stat_data { __le16 sd_mode; /* file type, permissions */ __le16 sd_attrs; /* persistent inode flags */ @@ -1572,25 +1795,20 @@ struct stat_data { __le32 sd_gid; /* group */ __le32 sd_atime; /* time of last access */ __le32 sd_mtime; /* time file was last modified */ - __le32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */ + + /* + * time inode (stat data) was last changed + * (except changes to sd_atime and sd_mtime) + */ + __le32 sd_ctime; __le32 sd_blocks; union { __le32 sd_rdev; __le32 sd_generation; - //__le32 sd_first_direct_byte; - /* first byte of file which is stored in a - direct item: except that if it equals 1 - it is a symlink and if it equals - ~(__u32)0 there is no direct item. The - existence of this field really grates - on me. Let's replace it with a macro - based on sd_size and our tail - suppression policy? */ } __attribute__ ((__packed__)) u; } __attribute__ ((__packed__)); -// -// this is 44 bytes long -// + +/* this is 44 bytes long */ #define SD_SIZE (sizeof(struct stat_data)) #define SD_V2_SIZE SD_SIZE #define stat_data_v2(ih) (ih_version (ih) == KEY_FORMAT_3_6) @@ -1621,48 +1839,61 @@ struct stat_data { #define sd_v2_attrs(sdp) (le16_to_cpu((sdp)->sd_attrs)) #define set_sd_v2_attrs(sdp,v) ((sdp)->sd_attrs = cpu_to_le16(v)) -/***************************************************************************/ -/* DIRECTORY STRUCTURE */ -/***************************************************************************/ -/* - Picture represents the structure of directory items - ________________________________________________ - | Array of | | | | | | - | directory |N-1| N-2 | .... | 1st |0th| - | entry headers | | | | | | - |_______________|___|_____|________|_______|___| - <---- directory entries ------> - - First directory item has k_offset component 1. We store "." and ".." - in one item, always, we never split "." and ".." into differing - items. This makes, among other things, the code for removing - directories simpler. */ +/*************************************************************************** + * DIRECTORY STRUCTURE * + ***************************************************************************/ +/* + * Picture represents the structure of directory items + * ________________________________________________ + * | Array of | | | | | | + * | directory |N-1| N-2 | .... | 1st |0th| + * | entry headers | | | | | | + * |_______________|___|_____|________|_______|___| + * <---- directory entries ------> + * + * First directory item has k_offset component 1. We store "." and ".." + * in one item, always, we never split "." and ".." into differing + * items. This makes, among other things, the code for removing + * directories simpler. + */ #define SD_OFFSET 0 #define SD_UNIQUENESS 0 #define DOT_OFFSET 1 #define DOT_DOT_OFFSET 2 #define DIRENTRY_UNIQUENESS 500 -/* */ #define FIRST_ITEM_OFFSET 1 /* - Q: How to get key of object pointed to by entry from entry? - - A: Each directory entry has its header. This header has deh_dir_id and deh_objectid fields, those are key - of object, entry points to */ + * Q: How to get key of object pointed to by entry from entry? + * + * A: Each directory entry has its header. This header has deh_dir_id + * and deh_objectid fields, those are key of object, entry points to + */ -/* NOT IMPLEMENTED: - Directory will someday contain stat data of object */ +/* + * NOT IMPLEMENTED: + * Directory will someday contain stat data of object + */ struct reiserfs_de_head { __le32 deh_offset; /* third component of the directory entry key */ - __le32 deh_dir_id; /* objectid of the parent directory of the object, that is referenced - by directory entry */ - __le32 deh_objectid; /* objectid of the object, that is referenced by directory entry */ + + /* + * objectid of the parent directory of the object, that is referenced + * by directory entry + */ + __le32 deh_dir_id; + + /* objectid of the object, that is referenced by directory entry */ + __le32 deh_objectid; __le16 deh_location; /* offset of name in the whole item */ - __le16 deh_state; /* whether 1) entry contains stat data (for future), and 2) whether - entry is hidden (unlinked) */ + + /* + * whether 1) entry contains stat data (for future), and + * 2) whether entry is hidden (unlinked) + */ + __le16 deh_state; } __attribute__ ((__packed__)); #define DEH_SIZE sizeof(struct reiserfs_de_head) #define deh_offset(p_deh) (le32_to_cpu((p_deh)->deh_offset)) @@ -1692,9 +1923,11 @@ struct reiserfs_de_head { # define ADDR_UNALIGNED_BITS (3) #endif -/* These are only used to manipulate deh_state. +/* + * These are only used to manipulate deh_state. * Because of this, we'll use the ext2_ bit routines, - * since they are little endian */ + * since they are little endian + */ #ifdef ADDR_UNALIGNED_BITS # define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1))) @@ -1729,46 +1962,16 @@ extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, __le32 par_dirid, __le32 par_objid); -/* array of the entry headers */ - /* get item body */ -#define B_I_PITEM(bh,ih) ( (bh)->b_data + ih_location(ih) ) -#define B_I_DEH(bh,ih) ((struct reiserfs_de_head *)(B_I_PITEM(bh,ih))) - -/* length of the directory entry in directory item. This define - calculates length of i-th directory entry using directory entry - locations from dir entry head. When it calculates length of 0-th - directory entry, it uses length of whole item in place of entry - location of the non-existent following entry in the calculation. - See picture above.*/ -/* -#define I_DEH_N_ENTRY_LENGTH(ih,deh,i) \ -((i) ? (deh_location((deh)-1) - deh_location((deh))) : (ih_item_len((ih)) - deh_location((deh)))) -*/ -static inline int entry_length(const struct buffer_head *bh, - const struct item_head *ih, int pos_in_item) -{ - struct reiserfs_de_head *deh; - - deh = B_I_DEH(bh, ih) + pos_in_item; - if (pos_in_item) - return deh_location(deh - 1) - deh_location(deh); - - return ih_item_len(ih) - deh_location(deh); -} - -/* number of entries in the directory item, depends on ENTRY_COUNT being at the start of directory dynamic data. */ -#define I_ENTRY_COUNT(ih) (ih_entry_count((ih))) - -/* name by bh, ih and entry_num */ -#define B_I_E_NAME(bh,ih,entry_num) ((char *)(bh->b_data + ih_location(ih) + deh_location(B_I_DEH(bh,ih)+(entry_num)))) - -// two entries per block (at least) +/* two entries per block (at least) */ #define REISERFS_MAX_NAME(block_size) 255 -/* this structure is used for operations on directory entries. It is - not a disk structure. */ -/* When reiserfs_find_entry or search_by_entry_key find directory - entry, they return filled reiserfs_dir_entry structure */ +/* + * this structure is used for operations on directory entries. It is + * not a disk structure. + * + * When reiserfs_find_entry or search_by_entry_key find directory + * entry, they return filled reiserfs_dir_entry structure + */ struct reiserfs_dir_entry { struct buffer_head *de_bh; int de_item_num; @@ -1786,10 +1989,14 @@ struct reiserfs_dir_entry { struct cpu_key de_entry_key; }; -/* these defines are useful when a particular member of a reiserfs_dir_entry is needed */ +/* + * these defines are useful when a particular member of + * a reiserfs_dir_entry is needed + */ /* pointer to file name, stored in entry */ -#define B_I_DEH_ENTRY_FILE_NAME(bh,ih,deh) (B_I_PITEM (bh, ih) + deh_location(deh)) +#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \ + (ih_item_body(bh, ih) + deh_location(deh)) /* length of name */ #define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \ @@ -1812,11 +2019,13 @@ struct reiserfs_dir_entry { * |______|_______________|___________________|___________| */ -/***************************************************************************/ -/* DISK CHILD */ -/***************************************************************************/ -/* Disk child pointer: The pointer from an internal node of the tree - to a node that is on disk. */ +/*************************************************************************** + * DISK CHILD * + ***************************************************************************/ +/* + * Disk child pointer: + * The pointer from an internal node of the tree to a node that is on disk. + */ struct disk_child { __le32 dc_block_number; /* Disk child's block number. */ __le16 dc_size; /* Disk child's used space. */ @@ -1849,47 +2058,66 @@ struct disk_child { #define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) ) #define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2) -/***************************************************************************/ -/* PATH STRUCTURES AND DEFINES */ -/***************************************************************************/ +/*************************************************************************** + * PATH STRUCTURES AND DEFINES * + ***************************************************************************/ -/* Search_by_key fills up the path from the root to the leaf as it descends the tree looking for the - key. It uses reiserfs_bread to try to find buffers in the cache given their block number. If it - does not find them in the cache it reads them from disk. For each node search_by_key finds using - reiserfs_bread it then uses bin_search to look through that node. bin_search will find the - position of the block_number of the next node if it is looking through an internal node. If it - is looking through a leaf node bin_search will find the position of the item which has key either - equal to given key, or which is the maximal key less than the given key. */ +/* + * search_by_key fills up the path from the root to the leaf as it descends + * the tree looking for the key. It uses reiserfs_bread to try to find + * buffers in the cache given their block number. If it does not find + * them in the cache it reads them from disk. For each node search_by_key + * finds using reiserfs_bread it then uses bin_search to look through that + * node. bin_search will find the position of the block_number of the next + * node if it is looking through an internal node. If it is looking through + * a leaf node bin_search will find the position of the item which has key + * either equal to given key, or which is the maximal key less than the + * given key. + */ struct path_element { - struct buffer_head *pe_buffer; /* Pointer to the buffer at the path in the tree. */ - int pe_position; /* Position in the tree node which is placed in the */ - /* buffer above. */ + /* Pointer to the buffer at the path in the tree. */ + struct buffer_head *pe_buffer; + /* Position in the tree node which is placed in the buffer above. */ + int pe_position; }; -#define MAX_HEIGHT 5 /* maximal height of a tree. don't change this without changing JOURNAL_PER_BALANCE_CNT */ -#define EXTENDED_MAX_HEIGHT 7 /* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */ -#define FIRST_PATH_ELEMENT_OFFSET 2 /* Must be equal to at least 2. */ - -#define ILLEGAL_PATH_ELEMENT_OFFSET 1 /* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */ -#define MAX_FEB_SIZE 6 /* this MUST be MAX_HEIGHT + 1. See about FEB below */ - -/* We need to keep track of who the ancestors of nodes are. When we - perform a search we record which nodes were visited while - descending the tree looking for the node we searched for. This list - of nodes is called the path. This information is used while - performing balancing. Note that this path information may become - invalid, and this means we must check it when using it to see if it - is still valid. You'll need to read search_by_key and the comments - in it, especially about decrement_counters_in_path(), to understand - this structure. - -Paths make the code so much harder to work with and debug.... An -enormous number of bugs are due to them, and trying to write or modify -code that uses them just makes my head hurt. They are based on an -excessive effort to avoid disturbing the precious VFS code.:-( The -gods only know how we are going to SMP the code that uses them. -znodes are the way! */ +/* + * maximal height of a tree. don't change this without + * changing JOURNAL_PER_BALANCE_CNT + */ +#define MAX_HEIGHT 5 + +/* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */ +#define EXTENDED_MAX_HEIGHT 7 + +/* Must be equal to at least 2. */ +#define FIRST_PATH_ELEMENT_OFFSET 2 + +/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */ +#define ILLEGAL_PATH_ELEMENT_OFFSET 1 + +/* this MUST be MAX_HEIGHT + 1. See about FEB below */ +#define MAX_FEB_SIZE 6 + +/* + * We need to keep track of who the ancestors of nodes are. When we + * perform a search we record which nodes were visited while + * descending the tree looking for the node we searched for. This list + * of nodes is called the path. This information is used while + * performing balancing. Note that this path information may become + * invalid, and this means we must check it when using it to see if it + * is still valid. You'll need to read search_by_key and the comments + * in it, especially about decrement_counters_in_path(), to understand + * this structure. + * + * Paths make the code so much harder to work with and debug.... An + * enormous number of bugs are due to them, and trying to write or modify + * code that uses them just makes my head hurt. They are based on an + * excessive effort to avoid disturbing the precious VFS code.:-( The + * gods only know how we are going to SMP the code that uses them. + * znodes are the way! + */ #define PATH_READA 0x1 /* do read ahead */ #define PATH_READA_BACK 0x2 /* read backwards */ @@ -1897,7 +2125,8 @@ znodes are the way! */ struct treepath { int path_length; /* Length of the array above. */ int reada; - struct path_element path_elements[EXTENDED_MAX_HEIGHT]; /* Array of the path elements. */ + /* Array of the path elements. */ + struct path_element path_elements[EXTENDED_MAX_HEIGHT]; int pos_in_item; }; @@ -1916,41 +2145,124 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} #define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position) #define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length)) - /* you know, to the person who didn't - write this the macro name does not - at first suggest what it does. - Maybe POSITION_FROM_PATH_END? Or - maybe we should just focus on - dumping paths... -Hans */ + +/* + * you know, to the person who didn't write this the macro name does not + * at first suggest what it does. Maybe POSITION_FROM_PATH_END? Or + * maybe we should just focus on dumping paths... -Hans + */ #define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length)) -#define PATH_PITEM_HEAD(path) B_N_PITEM_HEAD(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)) +/* + * in do_balance leaf has h == 0 in contrast with path structure, + * where root has level == 0. That is why we need these defines + */ + +/* tb->S[h] */ +#define PATH_H_PBUFFER(path, h) \ + PATH_OFFSET_PBUFFER(path, path->path_length - (h)) + +/* tb->F[h] or tb->S[0]->b_parent */ +#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1) -/* in do_balance leaf has h == 0 in contrast with path structure, - where root has level == 0. That is why we need these defines */ -#define PATH_H_PBUFFER(path, h) PATH_OFFSET_PBUFFER (path, path->path_length - (h)) /* tb->S[h] */ -#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER (path, (h) + 1) /* tb->F[h] or tb->S[0]->b_parent */ -#define PATH_H_POSITION(path, h) PATH_OFFSET_POSITION (path, path->path_length - (h)) -#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1) /* tb->S[h]->b_item_order */ +#define PATH_H_POSITION(path, h) \ + PATH_OFFSET_POSITION(path, path->path_length - (h)) + +/* tb->S[h]->b_item_order */ +#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1) #define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h)) +static inline void *reiserfs_node_data(const struct buffer_head *bh) +{ + return bh->b_data + sizeof(struct block_head); +} + +/* get key from internal node */ +static inline struct reiserfs_key *internal_key(struct buffer_head *bh, + int item_num) +{ + struct reiserfs_key *key = reiserfs_node_data(bh); + + return &key[item_num]; +} + +/* get the item header from leaf node */ +static inline struct item_head *item_head(const struct buffer_head *bh, + int item_num) +{ + struct item_head *ih = reiserfs_node_data(bh); + + return &ih[item_num]; +} + +/* get the key from leaf node */ +static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh, + int item_num) +{ + return &item_head(bh, item_num)->ih_key; +} + +static inline void *ih_item_body(const struct buffer_head *bh, + const struct item_head *ih) +{ + return bh->b_data + ih_location(ih); +} + +/* get item body from leaf node */ +static inline void *item_body(const struct buffer_head *bh, int item_num) +{ + return ih_item_body(bh, item_head(bh, item_num)); +} + +static inline struct item_head *tp_item_head(const struct treepath *path) +{ + return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)); +} + +static inline void *tp_item_body(const struct treepath *path) +{ + return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)); +} + #define get_last_bh(path) PATH_PLAST_BUFFER(path) -#define get_ih(path) PATH_PITEM_HEAD(path) #define get_item_pos(path) PATH_LAST_POSITION(path) -#define get_item(path) ((void *)B_N_PITEM(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION (path))) #define item_moved(ih,path) comp_items(ih, path) #define path_changed(ih,path) comp_items (ih, path) -/***************************************************************************/ -/* MISC */ -/***************************************************************************/ +/* array of the entry headers */ + /* get item body */ +#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih))) + +/* + * length of the directory entry in directory item. This define + * calculates length of i-th directory entry using directory entry + * locations from dir entry head. When it calculates length of 0-th + * directory entry, it uses length of whole item in place of entry + * location of the non-existent following entry in the calculation. + * See picture above. + */ +static inline int entry_length(const struct buffer_head *bh, + const struct item_head *ih, int pos_in_item) +{ + struct reiserfs_de_head *deh; + + deh = B_I_DEH(bh, ih) + pos_in_item; + if (pos_in_item) + return deh_location(deh - 1) - deh_location(deh); + + return ih_item_len(ih) - deh_location(deh); +} + +/*************************************************************************** + * MISC * + ***************************************************************************/ /* Size of pointer to the unformatted node. */ #define UNFM_P_SIZE (sizeof(unp_t)) #define UNFM_P_SHIFT 2 -// in in-core inode key is stored on le form +/* in in-core inode key is stored on le form */ #define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key)) #define MAX_UL_INT 0xffffffff @@ -1958,8 +2270,6 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} #define MAX_US_INT 0xffff // reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset -#define U32_MAX (~(__u32)0) - static inline loff_t max_reiserfs_offset(struct inode *inode) { if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5) @@ -1968,7 +2278,6 @@ static inline loff_t max_reiserfs_offset(struct inode *inode) return (loff_t) ((~(__u64) 0) >> 4); } -/*#define MAX_KEY_UNIQUENESS MAX_UL_INT*/ #define MAX_KEY_OBJECTID MAX_UL_INT #define MAX_B_NUM MAX_UL_INT @@ -1977,9 +2286,12 @@ static inline loff_t max_reiserfs_offset(struct inode *inode) /* the purpose is to detect overflow of an unsigned short */ #define REISERFS_LINK_MAX (MAX_US_INT - 1000) -/* The following defines are used in reiserfs_insert_item and reiserfs_append_item */ -#define REISERFS_KERNEL_MEM 0 /* reiserfs kernel memory mode */ -#define REISERFS_USER_MEM 1 /* reiserfs user memory mode */ +/* + * The following defines are used in reiserfs_insert_item + * and reiserfs_append_item + */ +#define REISERFS_KERNEL_MEM 0 /* kernel memory mode */ +#define REISERFS_USER_MEM 1 /* user memory mode */ #define fs_generation(s) (REISERFS_SB(s)->s_generation_counter) #define get_generation(s) atomic_read (&fs_generation(s)) @@ -1991,46 +2303,65 @@ static inline loff_t max_reiserfs_offset(struct inode *inode) __fs_changed(gen, s); \ }) -/***************************************************************************/ -/* FIXATE NODES */ -/***************************************************************************/ +/*************************************************************************** + * FIXATE NODES * + ***************************************************************************/ #define VI_TYPE_LEFT_MERGEABLE 1 #define VI_TYPE_RIGHT_MERGEABLE 2 -/* To make any changes in the tree we always first find node, that - contains item to be changed/deleted or place to insert a new - item. We call this node S. To do balancing we need to decide what - we will shift to left/right neighbor, or to a new node, where new - item will be etc. To make this analysis simpler we build virtual - node. Virtual node is an array of items, that will replace items of - node S. (For instance if we are going to delete an item, virtual - node does not contain it). Virtual node keeps information about - item sizes and types, mergeability of first and last items, sizes - of all entries in directory item. We use this array of items when - calculating what we can shift to neighbors and how many nodes we - have to have if we do not any shiftings, if we shift to left/right - neighbor or to both. */ +/* + * To make any changes in the tree we always first find node, that + * contains item to be changed/deleted or place to insert a new + * item. We call this node S. To do balancing we need to decide what + * we will shift to left/right neighbor, or to a new node, where new + * item will be etc. To make this analysis simpler we build virtual + * node. Virtual node is an array of items, that will replace items of + * node S. (For instance if we are going to delete an item, virtual + * node does not contain it). Virtual node keeps information about + * item sizes and types, mergeability of first and last items, sizes + * of all entries in directory item. We use this array of items when + * calculating what we can shift to neighbors and how many nodes we + * have to have if we do not any shiftings, if we shift to left/right + * neighbor or to both. + */ struct virtual_item { - int vi_index; // index in the array of item operations - unsigned short vi_type; // left/right mergeability - unsigned short vi_item_len; /* length of item that it will have after balancing */ + int vi_index; /* index in the array of item operations */ + unsigned short vi_type; /* left/right mergeability */ + + /* length of item that it will have after balancing */ + unsigned short vi_item_len; + struct item_head *vi_ih; - const char *vi_item; // body of item (old or new) - const void *vi_new_data; // 0 always but paste mode - void *vi_uarea; // item specific area + const char *vi_item; /* body of item (old or new) */ + const void *vi_new_data; /* 0 always but paste mode */ + void *vi_uarea; /* item specific area */ }; struct virtual_node { - char *vn_free_ptr; /* this is a pointer to the free space in the buffer */ + /* this is a pointer to the free space in the buffer */ + char *vn_free_ptr; + unsigned short vn_nr_item; /* number of items in virtual node */ - short vn_size; /* size of node , that node would have if it has unlimited size and no balancing is performed */ - short vn_mode; /* mode of balancing (paste, insert, delete, cut) */ + + /* + * size of node , that node would have if it has + * unlimited size and no balancing is performed + */ + short vn_size; + + /* mode of balancing (paste, insert, delete, cut) */ + short vn_mode; + short vn_affected_item_num; short vn_pos_in_item; - struct item_head *vn_ins_ih; /* item header of inserted item, 0 for other modes */ + + /* item header of inserted item, 0 for other modes */ + struct item_head *vn_ins_ih; const void *vn_data; - struct virtual_item *vn_vi; /* array of items (including a new one, excluding item to be deleted) */ + + /* array of items (including a new one, excluding item to be deleted) */ + struct virtual_item *vn_vi; }; /* used by directory items when creating virtual nodes */ @@ -2040,22 +2371,25 @@ struct direntry_uarea { __u16 entry_sizes[1]; } __attribute__ ((__packed__)); -/***************************************************************************/ -/* TREE BALANCE */ -/***************************************************************************/ +/*************************************************************************** + * TREE BALANCE * + ***************************************************************************/ -/* This temporary structure is used in tree balance algorithms, and - constructed as we go to the extent that its various parts are - needed. It contains arrays of nodes that can potentially be - involved in the balancing of node S, and parameters that define how - each of the nodes must be balanced. Note that in these algorithms - for balancing the worst case is to need to balance the current node - S and the left and right neighbors and all of their parents plus - create a new node. We implement S1 balancing for the leaf nodes - and S0 balancing for the internal nodes (S1 and S0 are defined in - our papers.)*/ +/* + * This temporary structure is used in tree balance algorithms, and + * constructed as we go to the extent that its various parts are + * needed. It contains arrays of nodes that can potentially be + * involved in the balancing of node S, and parameters that define how + * each of the nodes must be balanced. Note that in these algorithms + * for balancing the worst case is to need to balance the current node + * S and the left and right neighbors and all of their parents plus + * create a new node. We implement S1 balancing for the leaf nodes + * and S0 balancing for the internal nodes (S1 and S0 are defined in + * our papers.) + */ -#define MAX_FREE_BLOCK 7 /* size of the array of buffers to free at end of do_balance */ +/* size of the array of buffers to free at end of do_balance */ +#define MAX_FREE_BLOCK 7 /* maximum number of FEB blocknrs on a single level */ #define MAX_AMOUNT_NEEDED 2 @@ -2067,64 +2401,144 @@ struct tree_balance { struct super_block *tb_sb; struct reiserfs_transaction_handle *transaction_handle; struct treepath *tb_path; - struct buffer_head *L[MAX_HEIGHT]; /* array of left neighbors of nodes in the path */ - struct buffer_head *R[MAX_HEIGHT]; /* array of right neighbors of nodes in the path */ - struct buffer_head *FL[MAX_HEIGHT]; /* array of fathers of the left neighbors */ - struct buffer_head *FR[MAX_HEIGHT]; /* array of fathers of the right neighbors */ - struct buffer_head *CFL[MAX_HEIGHT]; /* array of common parents of center node and its left neighbor */ - struct buffer_head *CFR[MAX_HEIGHT]; /* array of common parents of center node and its right neighbor */ - - struct buffer_head *FEB[MAX_FEB_SIZE]; /* array of empty buffers. Number of buffers in array equals - cur_blknum. */ + + /* array of left neighbors of nodes in the path */ + struct buffer_head *L[MAX_HEIGHT]; + + /* array of right neighbors of nodes in the path */ + struct buffer_head *R[MAX_HEIGHT]; + + /* array of fathers of the left neighbors */ + struct buffer_head *FL[MAX_HEIGHT]; + + /* array of fathers of the right neighbors */ + struct buffer_head *FR[MAX_HEIGHT]; + /* array of common parents of center node and its left neighbor */ + struct buffer_head *CFL[MAX_HEIGHT]; + + /* array of common parents of center node and its right neighbor */ + struct buffer_head *CFR[MAX_HEIGHT]; + + /* + * array of empty buffers. Number of buffers in array equals + * cur_blknum. + */ + struct buffer_head *FEB[MAX_FEB_SIZE]; struct buffer_head *used[MAX_FEB_SIZE]; struct buffer_head *thrown[MAX_FEB_SIZE]; - int lnum[MAX_HEIGHT]; /* array of number of items which must be - shifted to the left in order to balance the - current node; for leaves includes item that - will be partially shifted; for internal - nodes, it is the number of child pointers - rather than items. It includes the new item - being created. The code sometimes subtracts - one to get the number of wholly shifted - items for other purposes. */ - int rnum[MAX_HEIGHT]; /* substitute right for left in comment above */ - int lkey[MAX_HEIGHT]; /* array indexed by height h mapping the key delimiting L[h] and - S[h] to its item number within the node CFL[h] */ - int rkey[MAX_HEIGHT]; /* substitute r for l in comment above */ - int insert_size[MAX_HEIGHT]; /* the number of bytes by we are trying to add or remove from - S[h]. A negative value means removing. */ - int blknum[MAX_HEIGHT]; /* number of nodes that will replace node S[h] after - balancing on the level h of the tree. If 0 then S is - being deleted, if 1 then S is remaining and no new nodes - are being created, if 2 or 3 then 1 or 2 new nodes is - being created */ + + /* + * array of number of items which must be shifted to the left in + * order to balance the current node; for leaves includes item that + * will be partially shifted; for internal nodes, it is the number + * of child pointers rather than items. It includes the new item + * being created. The code sometimes subtracts one to get the + * number of wholly shifted items for other purposes. + */ + int lnum[MAX_HEIGHT]; + + /* substitute right for left in comment above */ + int rnum[MAX_HEIGHT]; + + /* + * array indexed by height h mapping the key delimiting L[h] and + * S[h] to its item number within the node CFL[h] + */ + int lkey[MAX_HEIGHT]; + + /* substitute r for l in comment above */ + int rkey[MAX_HEIGHT]; + + /* + * the number of bytes by we are trying to add or remove from + * S[h]. A negative value means removing. + */ + int insert_size[MAX_HEIGHT]; + + /* + * number of nodes that will replace node S[h] after balancing + * on the level h of the tree. If 0 then S is being deleted, + * if 1 then S is remaining and no new nodes are being created, + * if 2 or 3 then 1 or 2 new nodes is being created + */ + int blknum[MAX_HEIGHT]; /* fields that are used only for balancing leaves of the tree */ - int cur_blknum; /* number of empty blocks having been already allocated */ - int s0num; /* number of items that fall into left most node when S[0] splits */ - int s1num; /* number of items that fall into first new node when S[0] splits */ - int s2num; /* number of items that fall into second new node when S[0] splits */ - int lbytes; /* number of bytes which can flow to the left neighbor from the left */ - /* most liquid item that cannot be shifted from S[0] entirely */ - /* if -1 then nothing will be partially shifted */ - int rbytes; /* number of bytes which will flow to the right neighbor from the right */ - /* most liquid item that cannot be shifted from S[0] entirely */ - /* if -1 then nothing will be partially shifted */ - int s1bytes; /* number of bytes which flow to the first new node when S[0] splits */ - /* note: if S[0] splits into 3 nodes, then items do not need to be cut */ - int s2bytes; - struct buffer_head *buf_to_free[MAX_FREE_BLOCK]; /* buffers which are to be freed after do_balance finishes by unfix_nodes */ - char *vn_buf; /* kmalloced memory. Used to create - virtual node and keep map of - dirtied bitmap blocks */ + + /* number of empty blocks having been already allocated */ + int cur_blknum; + + /* number of items that fall into left most node when S[0] splits */ + int s0num; + + /* + * number of bytes which can flow to the left neighbor from the left + * most liquid item that cannot be shifted from S[0] entirely + * if -1 then nothing will be partially shifted + */ + int lbytes; + + /* + * number of bytes which will flow to the right neighbor from the right + * most liquid item that cannot be shifted from S[0] entirely + * if -1 then nothing will be partially shifted + */ + int rbytes; + + + /* + * index into the array of item headers in + * S[0] of the affected item + */ + int item_pos; + + /* new nodes allocated to hold what could not fit into S */ + struct buffer_head *S_new[2]; + + /* + * number of items that will be placed into nodes in S_new + * when S[0] splits + */ + int snum[2]; + + /* + * number of bytes which flow to nodes in S_new when S[0] splits + * note: if S[0] splits into 3 nodes, then items do not need to be cut + */ + int sbytes[2]; + + int pos_in_item; + int zeroes_num; + + /* + * buffers which are to be freed after do_balance finishes + * by unfix_nodes + */ + struct buffer_head *buf_to_free[MAX_FREE_BLOCK]; + + /* + * kmalloced memory. Used to create virtual node and keep + * map of dirtied bitmap blocks + */ + char *vn_buf; + int vn_buf_size; /* size of the vn_buf */ - struct virtual_node *tb_vn; /* VN starts after bitmap of bitmap blocks */ - int fs_gen; /* saved value of `reiserfs_generation' counter - see FILESYSTEM_CHANGED() macro in reiserfs_fs.h */ + /* VN starts after bitmap of bitmap blocks */ + struct virtual_node *tb_vn; + + /* + * saved value of `reiserfs_generation' counter see + * FILESYSTEM_CHANGED() macro in reiserfs_fs.h + */ + int fs_gen; + #ifdef DISPLACE_NEW_PACKING_LOCALITIES - struct in_core_key key; /* key pointer, to pass to block allocator or - another low-level subsystem */ + /* + * key pointer, to pass to block allocator or + * another low-level subsystem + */ + struct in_core_key key; #endif }; @@ -2132,20 +2546,24 @@ struct tree_balance { /* When inserting an item. */ #define M_INSERT 'i' -/* When inserting into (directories only) or appending onto an already - existent item. */ +/* + * When inserting into (directories only) or appending onto an already + * existent item. + */ #define M_PASTE 'p' /* When deleting an item. */ #define M_DELETE 'd' /* When truncating an item or removing an entry from a (directory) item. */ -#define M_CUT 'c' +#define M_CUT 'c' /* used when balancing on leaf level skipped (in reiserfsck) */ #define M_INTERNAL 'n' -/* When further balancing is not needed, then do_balance does not need - to be called. */ -#define M_SKIP_BALANCING 's' +/* + * When further balancing is not needed, then do_balance does not need + * to be called. + */ +#define M_SKIP_BALANCING 's' #define M_CONVERT 'v' /* modes of leaf_move_items */ @@ -2158,8 +2576,10 @@ struct tree_balance { #define FIRST_TO_LAST 0 #define LAST_TO_FIRST 1 -/* used in do_balance for passing parent of node information that has - been gotten from tb struct */ +/* + * used in do_balance for passing parent of node information that has + * been gotten from tb struct + */ struct buffer_info { struct tree_balance *tb; struct buffer_head *bi_bh; @@ -2177,20 +2597,24 @@ static inline struct super_block *sb_from_bi(struct buffer_info *bi) return bi ? sb_from_tb(bi->tb) : NULL; } -/* there are 4 types of items: stat data, directory item, indirect, direct. -+-------------------+------------+--------------+------------+ -| | k_offset | k_uniqueness | mergeable? | -+-------------------+------------+--------------+------------+ -| stat data | 0 | 0 | no | -+-------------------+------------+--------------+------------+ -| 1st directory item| DOT_OFFSET |DIRENTRY_UNIQUENESS| no | -| non 1st directory | hash value | | yes | -| item | | | | -+-------------------+------------+--------------+------------+ -| indirect item | offset + 1 |TYPE_INDIRECT | if this is not the first indirect item of the object -+-------------------+------------+--------------+------------+ -| direct item | offset + 1 |TYPE_DIRECT | if not this is not the first direct item of the object -+-------------------+------------+--------------+------------+ +/* + * there are 4 types of items: stat data, directory item, indirect, direct. + * +-------------------+------------+--------------+------------+ + * | | k_offset | k_uniqueness | mergeable? | + * +-------------------+------------+--------------+------------+ + * | stat data | 0 | 0 | no | + * +-------------------+------------+--------------+------------+ + * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. | no | + * | non 1st directory | hash value | UNIQUENESS | yes | + * | item | | | | + * +-------------------+------------+--------------+------------+ + * | indirect item | offset + 1 |TYPE_INDIRECT | [1] | + * +-------------------+------------+--------------+------------+ + * | direct item | offset + 1 |TYPE_DIRECT | [2] | + * +-------------------+------------+--------------+------------+ + * + * [1] if this is not the first indirect item of the object + * [2] if this is not the first direct item of the object */ struct item_operations { @@ -2229,49 +2653,43 @@ extern struct item_operations *item_ops[TYPE_ANY + 1]; /* number of blocks pointed to by the indirect item */ #define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE) -/* the used space within the unformatted node corresponding to pos within the item pointed to by ih */ +/* + * the used space within the unformatted node corresponding + * to pos within the item pointed to by ih + */ #define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size)) -/* number of bytes contained by the direct item or the unformatted nodes the indirect item points to */ - -/* get the item header */ -#define B_N_PITEM_HEAD(bh,item_num) ( (struct item_head * )((bh)->b_data + BLKH_SIZE) + (item_num) ) - -/* get key */ -#define B_N_PDELIM_KEY(bh,item_num) ( (struct reiserfs_key * )((bh)->b_data + BLKH_SIZE) + (item_num) ) - -/* get the key */ -#define B_N_PKEY(bh,item_num) ( &(B_N_PITEM_HEAD(bh,item_num)->ih_key) ) - -/* get item body */ -#define B_N_PITEM(bh,item_num) ( (bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(item_num)))) - -/* get the stat data by the buffer header and the item order */ -#define B_N_STAT_DATA(bh,nr) \ -( (struct stat_data *)((bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(nr))) ) ) +/* + * number of bytes contained by the direct item or the + * unformatted nodes the indirect item points to + */ - /* following defines use reiserfs buffer header and item header */ +/* following defines use reiserfs buffer header and item header */ /* get stat-data */ #define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) ) -// this is 3976 for size==4096 +/* this is 3976 for size==4096 */ #define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE) -/* indirect items consist of entries which contain blocknrs, pos - indicates which entry, and B_I_POS_UNFM_POINTER resolves to the - blocknr contained by the entry pos points to */ -#define B_I_POS_UNFM_POINTER(bh,ih,pos) le32_to_cpu(*(((unp_t *)B_I_PITEM(bh,ih)) + (pos))) -#define PUT_B_I_POS_UNFM_POINTER(bh,ih,pos, val) do {*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)) = cpu_to_le32(val); } while (0) +/* + * indirect items consist of entries which contain blocknrs, pos + * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the + * blocknr contained by the entry pos points to + */ +#define B_I_POS_UNFM_POINTER(bh, ih, pos) \ + le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos))) +#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val) \ + (*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val)) struct reiserfs_iget_args { __u32 objectid; __u32 dirid; }; -/***************************************************************************/ -/* FUNCTION DECLARATIONS */ -/***************************************************************************/ +/*************************************************************************** + * FUNCTION DECLARATIONS * + ***************************************************************************/ #define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12) @@ -2283,7 +2701,10 @@ struct reiserfs_iget_args { /* first block written in a commit. */ struct reiserfs_journal_desc { __le32 j_trans_id; /* id of commit */ - __le32 j_len; /* length of commit. len +1 is the commit block */ + + /* length of commit. len +1 is the commit block */ + __le32 j_len; + __le32 j_mount_id; /* mount id of this trans */ __le32 j_realblock[1]; /* real locations for each block */ }; @@ -2310,22 +2731,35 @@ struct reiserfs_journal_commit { #define set_commit_trans_id(c,val) do { (c)->j_trans_id = cpu_to_le32 (val); } while (0) #define set_commit_trans_len(c,val) do { (c)->j_len = cpu_to_le32 (val); } while (0) -/* this header block gets written whenever a transaction is considered fully flushed, and is more recent than the -** last fully flushed transaction. fully flushed means all the log blocks and all the real blocks are on disk, -** and this transaction does not need to be replayed. -*/ +/* + * this header block gets written whenever a transaction is considered + * fully flushed, and is more recent than the last fully flushed transaction. + * fully flushed means all the log blocks and all the real blocks are on + * disk, and this transaction does not need to be replayed. + */ struct reiserfs_journal_header { - __le32 j_last_flush_trans_id; /* id of last fully flushed transaction */ - __le32 j_first_unflushed_offset; /* offset in the log of where to start replay after a crash */ + /* id of last fully flushed transaction */ + __le32 j_last_flush_trans_id; + + /* offset in the log of where to start replay after a crash */ + __le32 j_first_unflushed_offset; + __le32 j_mount_id; /* 12 */ struct journal_params jh_journal; }; /* biggest tunable defines are right here */ #define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */ -#define JOURNAL_TRANS_MAX_DEFAULT 1024 /* biggest possible single transaction, don't change for now (8/3/99) */ + +/* biggest possible single transaction, don't change for now (8/3/99) */ +#define JOURNAL_TRANS_MAX_DEFAULT 1024 #define JOURNAL_TRANS_MIN_DEFAULT 256 -#define JOURNAL_MAX_BATCH_DEFAULT 900 /* max blocks to batch into one transaction, don't make this any bigger than 900 */ + +/* + * max blocks to batch into one transaction, + * don't make this any bigger than 900 + */ +#define JOURNAL_MAX_BATCH_DEFAULT 900 #define JOURNAL_MIN_RATIO 2 #define JOURNAL_MAX_COMMIT_AGE 30 #define JOURNAL_MAX_TRANS_AGE 30 @@ -2350,16 +2784,18 @@ struct reiserfs_journal_header { #define REISERFS_QUOTA_DEL_BLOCKS(s) 0 #endif -/* both of these can be as low as 1, or as high as you want. The min is the -** number of 4k bitmap nodes preallocated on mount. New nodes are allocated -** as needed, and released when transactions are committed. On release, if -** the current number of nodes is > max, the node is freed, otherwise, -** it is put on a free list for faster use later. +/* + * both of these can be as low as 1, or as high as you want. The min is the + * number of 4k bitmap nodes preallocated on mount. New nodes are allocated + * as needed, and released when transactions are committed. On release, if + * the current number of nodes is > max, the node is freed, otherwise, + * it is put on a free list for faster use later. */ #define REISERFS_MIN_BITMAP_NODES 10 #define REISERFS_MAX_BITMAP_NODES 100 -#define JBH_HASH_SHIFT 13 /* these are based on journal hash size of 8192 */ +/* these are based on journal hash size of 8192 */ +#define JBH_HASH_SHIFT 13 #define JBH_HASH_MASK 8191 #define _jhashfn(sb,block) \ @@ -2367,7 +2803,7 @@ struct reiserfs_journal_header { (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12)))) #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK]) -// We need these to make journal.c code more readable +/* We need these to make journal.c code more readable */ #define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) @@ -2375,12 +2811,14 @@ struct reiserfs_journal_header { enum reiserfs_bh_state_bits { BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */ BH_JDirty_wait, - BH_JNew, /* disk block was taken off free list before - * being in a finished transaction, or - * written to disk. Can be reused immed. */ + /* + * disk block was taken off free list before being in a + * finished transaction, or written to disk. Can be reused immed. + */ + BH_JNew, BH_JPrepared, BH_JRestore_dirty, - BH_JTest, // debugging only will go away + BH_JTest, /* debugging only will go away */ }; BUFFER_FNS(JDirty, journaled); @@ -2396,27 +2834,36 @@ TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty); BUFFER_FNS(JTest, journal_test); TAS_BUFFER_FNS(JTest, journal_test); -/* -** transaction handle which is passed around for all journal calls -*/ +/* transaction handle which is passed around for all journal calls */ struct reiserfs_transaction_handle { - struct super_block *t_super; /* super for this FS when journal_begin was - called. saves calls to reiserfs_get_super - also used by nested transactions to make - sure they are nesting on the right FS - _must_ be first in the handle - */ + /* + * super for this FS when journal_begin was called. saves calls to + * reiserfs_get_super also used by nested transactions to make + * sure they are nesting on the right FS _must_ be first + * in the handle + */ + struct super_block *t_super; + int t_refcount; int t_blocks_logged; /* number of blocks this writer has logged */ int t_blocks_allocated; /* number of blocks this writer allocated */ - unsigned int t_trans_id; /* sanity check, equals the current trans id */ + + /* sanity check, equals the current trans id */ + unsigned int t_trans_id; + void *t_handle_save; /* save existing current->journal_info */ - unsigned displace_new_blocks:1; /* if new block allocation occurres, that block - should be displaced from others */ + + /* + * if new block allocation occurres, that block + * should be displaced from others + */ + unsigned displace_new_blocks:1; + struct list_head t_list; }; -/* used to keep track of ordered and tail writes, attached to the buffer +/* + * used to keep track of ordered and tail writes, attached to the buffer * head through b_journal_head. */ struct reiserfs_jh { @@ -2429,7 +2876,7 @@ void reiserfs_free_jh(struct buffer_head *bh); int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh); int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh); int journal_mark_dirty(struct reiserfs_transaction_handle *, - struct super_block *, struct buffer_head *bh); + struct buffer_head *bh); static inline int reiserfs_file_data_log(struct inode *inode) { @@ -2479,10 +2926,8 @@ int journal_init(struct super_block *, const char *j_dev_name, int old_format, int journal_release(struct reiserfs_transaction_handle *, struct super_block *); int journal_release_error(struct reiserfs_transaction_handle *, struct super_block *); -int journal_end(struct reiserfs_transaction_handle *, struct super_block *, - unsigned long); -int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, - unsigned long); +int journal_end(struct reiserfs_transaction_handle *); +int journal_end_sync(struct reiserfs_transaction_handle *); int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr); int journal_transaction_should_end(struct reiserfs_transaction_handle *, int); @@ -2491,7 +2936,7 @@ int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, int journal_begin(struct reiserfs_transaction_handle *, struct super_block *sb, unsigned long); int journal_join_abort(struct reiserfs_transaction_handle *, - struct super_block *sb, unsigned long); + struct super_block *sb); void reiserfs_abort_journal(struct super_block *sb, int errno); void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...); int reiserfs_allocate_list_bitmaps(struct super_block *s, @@ -2513,20 +2958,18 @@ int B_IS_IN_TREE(const struct buffer_head *); extern void copy_item_head(struct item_head *to, const struct item_head *from); -// first key is in cpu form, second - le +/* first key is in cpu form, second - le */ extern int comp_short_keys(const struct reiserfs_key *le_key, const struct cpu_key *cpu_key); extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from); -// both are in le form +/* both are in le form */ extern int comp_le_keys(const struct reiserfs_key *, const struct reiserfs_key *); extern int comp_short_le_keys(const struct reiserfs_key *, const struct reiserfs_key *); -// -// get key version from on disk key - kludge -// +/* * get key version from on disk key - kludge */ static inline int le_key_version(const struct reiserfs_key *key) { int type; @@ -2603,12 +3046,12 @@ void padd_item(char *item, int total_length, int length); /* inode.c */ /* args for the create parameter of reiserfs_get_block */ -#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */ -#define GET_BLOCK_CREATE 1 /* add anything you need to find block */ -#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */ -#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */ -#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */ -#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */ +#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */ +#define GET_BLOCK_CREATE 1 /* add anything you need to find block */ +#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */ +#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */ +#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */ +#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */ void reiserfs_read_locked_inode(struct inode *inode, struct reiserfs_iget_args *args); @@ -2807,25 +3250,49 @@ struct buffer_head *get_FEB(struct tree_balance *); /* bitmap.c */ -/* structure contains hints for block allocator, and it is a container for - * arguments, such as node, search path, transaction_handle, etc. */ +/* + * structure contains hints for block allocator, and it is a container for + * arguments, such as node, search path, transaction_handle, etc. + */ struct __reiserfs_blocknr_hint { - struct inode *inode; /* inode passed to allocator, if we allocate unf. nodes */ + /* inode passed to allocator, if we allocate unf. nodes */ + struct inode *inode; + sector_t block; /* file offset, in blocks */ struct in_core_key key; - struct treepath *path; /* search path, used by allocator to deternine search_start by - * various ways */ - struct reiserfs_transaction_handle *th; /* transaction handle is needed to log super blocks and - * bitmap blocks changes */ + + /* + * search path, used by allocator to deternine search_start by + * various ways + */ + struct treepath *path; + + /* + * transaction handle is needed to log super blocks + * and bitmap blocks changes + */ + struct reiserfs_transaction_handle *th; + b_blocknr_t beg, end; - b_blocknr_t search_start; /* a field used to transfer search start value (block number) - * between different block allocator procedures - * (determine_search_start() and others) */ - int prealloc_size; /* is set in determine_prealloc_size() function, used by underlayed - * function that do actual allocation */ - - unsigned formatted_node:1; /* the allocator uses different polices for getting disk space for - * formatted/unformatted blocks with/without preallocation */ + + /* + * a field used to transfer search start value (block number) + * between different block allocator procedures + * (determine_search_start() and others) + */ + b_blocknr_t search_start; + + /* + * is set in determine_prealloc_size() function, + * used by underlayed function that do actual allocation + */ + int prealloc_size; + + /* + * the allocator uses different polices for getting disk + * space for formatted/unformatted blocks with/without preallocation + */ + unsigned formatted_node:1; unsigned preallocate:1; }; @@ -2841,6 +3308,7 @@ void reiserfs_init_alloc_options(struct super_block *s); */ __le32 reiserfs_choose_packing(struct inode *dir); +void show_alloc_options(struct seq_file *seq, struct super_block *s); int reiserfs_init_bitmap_cache(struct super_block *sb); void reiserfs_free_bitmap_cache(struct super_block *sb); void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info); @@ -2918,13 +3386,15 @@ __u32 r5_hash(const signed char *msg, int len); #define reiserfs_test_le_bit test_bit_le #define reiserfs_find_next_zero_le_bit find_next_zero_bit_le -/* sometimes reiserfs_truncate may require to allocate few new blocks - to perform indirect2direct conversion. People probably used to - think, that truncate should work without problems on a filesystem - without free disk space. They may complain that they can not - truncate due to lack of free disk space. This spare space allows us - to not worry about it. 500 is probably too much, but it should be - absolutely safe */ +/* + * sometimes reiserfs_truncate may require to allocate few new blocks + * to perform indirect2direct conversion. People probably used to + * think, that truncate should work without problems on a filesystem + * without free disk space. They may complain that they can not + * truncate due to lack of free disk space. This spare space allows us + * to not worry about it. 500 is probably too much, but it should be + * absolutely safe + */ #define SPARE_SPACE 500 /* prototypes from ioctl.c */ diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index a4ef5cd606e..6052d323bc9 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -53,8 +53,10 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) } bforget(bh); - /* old disk layout detection; those partitions can be mounted, but - * cannot be resized */ + /* + * old disk layout detection; those partitions can be mounted, but + * cannot be resized + */ if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size != REISERFS_DISK_OFFSET_IN_BYTES) { printk @@ -86,12 +88,14 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) ("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); return -ENOMEM; } - /* the new journal bitmaps are zero filled, now we copy in the bitmap - ** node pointers from the old journal bitmap structs, and then - ** transfer the new data structures into the journal struct. - ** - ** using the copy_size var below allows this code to work for - ** both shrinking and expanding the FS. + /* + * the new journal bitmaps are zero filled, now we copy i + * the bitmap node pointers from the old journal bitmap + * structs, and then transfer the new data structures + * into the journal struct. + * + * using the copy_size var below allows this code to work for + * both shrinking and expanding the FS. */ copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr; copy_size = @@ -101,36 +105,45 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) jb = SB_JOURNAL(s)->j_list_bitmap + i; memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size); - /* just in case vfree schedules on us, copy the new - ** pointer into the journal struct before freeing the - ** old one + /* + * just in case vfree schedules on us, copy the new + * pointer into the journal struct before freeing the + * old one */ node_tmp = jb->bitmaps; jb->bitmaps = jbitmap[i].bitmaps; vfree(node_tmp); } - /* allocate additional bitmap blocks, reallocate array of bitmap - * block pointers */ + /* + * allocate additional bitmap blocks, reallocate + * array of bitmap block pointers + */ bitmap = vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); if (!bitmap) { - /* Journal bitmaps are still supersized, but the memory isn't - * leaked, so I guess it's ok */ + /* + * Journal bitmaps are still supersized, but the + * memory isn't leaked, so I guess it's ok + */ printk("reiserfs_resize: unable to allocate memory.\n"); return -ENOMEM; } for (i = 0; i < bmap_nr; i++) bitmap[i] = old_bitmap[i]; - /* This doesn't go through the journal, but it doesn't have to. - * The changes are still atomic: We're synced up when the journal - * transaction begins, and the new bitmaps don't matter if the - * transaction fails. */ + /* + * This doesn't go through the journal, but it doesn't have to. + * The changes are still atomic: We're synced up when the + * journal transaction begins, and the new bitmaps don't + * matter if the transaction fails. + */ for (i = bmap_nr; i < bmap_nr_new; i++) { int depth; - /* don't use read_bitmap_block since it will cache - * the uninitialized bitmap */ + /* + * don't use read_bitmap_block since it will cache + * the uninitialized bitmap + */ depth = reiserfs_write_unlock_nested(s); bh = sb_bread(s, i * s->s_blocksize * 8); reiserfs_write_lock_nested(s, depth); @@ -147,7 +160,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) depth = reiserfs_write_unlock_nested(s); sync_dirty_buffer(bh); reiserfs_write_lock_nested(s, depth); - // update bitmap_info stuff + /* update bitmap_info stuff */ bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; brelse(bh); } @@ -156,9 +169,11 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) vfree(old_bitmap); } - /* begin transaction, if there was an error, it's fine. Yes, we have + /* + * begin transaction, if there was an error, it's fine. Yes, we have * incorrect bitmaps now, but none of it is ever going to touch the - * disk anyway. */ + * disk anyway. + */ err = journal_begin(&th, s, 10); if (err) return err; @@ -167,7 +182,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) info = SB_AP_BITMAP(s) + bmap_nr - 1; bh = reiserfs_read_bitmap_block(s, bmap_nr - 1); if (!bh) { - int jerr = journal_end(&th, s, 10); + int jerr = journal_end(&th); if (jerr) return jerr; return -EIO; @@ -178,14 +193,14 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) reiserfs_clear_le_bit(i, bh->b_data); info->free_count += s->s_blocksize * 8 - block_r; - journal_mark_dirty(&th, s, bh); + journal_mark_dirty(&th, bh); brelse(bh); /* Correct new last bitmap block - It may not be full */ info = SB_AP_BITMAP(s) + bmap_nr_new - 1; bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1); if (!bh) { - int jerr = journal_end(&th, s, 10); + int jerr = journal_end(&th); if (jerr) return jerr; return -EIO; @@ -194,7 +209,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) reiserfs_prepare_for_journal(s, bh, 1); for (i = block_r_new; i < s->s_blocksize * 8; i++) reiserfs_set_le_bit(i, bh->b_data); - journal_mark_dirty(&th, s, bh); + journal_mark_dirty(&th, bh); brelse(bh); info->free_count -= s->s_blocksize * 8 - block_r_new; @@ -207,8 +222,8 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) PUT_SB_BLOCK_COUNT(s, block_count_new); PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); SB_JOURNAL(s)->j_must_wait = 1; - return journal_end(&th, s, 10); + return journal_end(&th); } diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index b14706a05d5..dd44468edc2 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -8,46 +8,6 @@ * Pereslavl-Zalessky Russia */ -/* - * This file contains functions dealing with S+tree - * - * B_IS_IN_TREE - * copy_item_head - * comp_short_keys - * comp_keys - * comp_short_le_keys - * le_key2cpu_key - * comp_le_keys - * bin_search - * get_lkey - * get_rkey - * key_in_buffer - * decrement_bcount - * reiserfs_check_path - * pathrelse_and_restore - * pathrelse - * search_by_key_reada - * search_by_key - * search_for_position_by_key - * comp_items - * prepare_for_direct_item - * prepare_for_direntry_item - * prepare_for_delete_or_cut - * calc_deleted_bytes_number - * init_tb_struct - * padd_item - * reiserfs_delete_item - * reiserfs_delete_solid_item - * reiserfs_delete_object - * maybe_indirect_to_direct - * indirect_to_direct_roll_back - * reiserfs_cut_from_item - * truncate_directory - * reiserfs_do_truncate - * reiserfs_paste_into_item - * reiserfs_insert_item - */ - #include <linux/time.h> #include <linux/string.h> #include <linux/pagemap.h> @@ -65,21 +25,21 @@ inline int B_IS_IN_TREE(const struct buffer_head *bh) return (B_LEVEL(bh) != FREE_LEVEL); } -// -// to gets item head in le form -// +/* to get item head in le form */ inline void copy_item_head(struct item_head *to, const struct item_head *from) { memcpy(to, from, IH_SIZE); } -/* k1 is pointer to on-disk structure which is stored in little-endian - form. k2 is pointer to cpu variable. For key of items of the same - object this returns 0. - Returns: -1 if key1 < key2 - 0 if key1 == key2 - 1 if key1 > key2 */ +/* + * k1 is pointer to on-disk structure which is stored in little-endian + * form. k2 is pointer to cpu variable. For key of items of the same + * object this returns 0. + * Returns: -1 if key1 < key2 + * 0 if key1 == key2 + * 1 if key1 > key2 + */ inline int comp_short_keys(const struct reiserfs_key *le_key, const struct cpu_key *cpu_key) { @@ -97,11 +57,13 @@ inline int comp_short_keys(const struct reiserfs_key *le_key, return 0; } -/* k1 is pointer to on-disk structure which is stored in little-endian - form. k2 is pointer to cpu variable. - Compare keys using all 4 key fields. - Returns: -1 if key1 < key2 0 - if key1 = key2 1 if key1 > key2 */ +/* + * k1 is pointer to on-disk structure which is stored in little-endian + * form. k2 is pointer to cpu variable. + * Compare keys using all 4 key fields. + * Returns: -1 if key1 < key2 0 + * if key1 = key2 1 if key1 > key2 + */ static inline int comp_keys(const struct reiserfs_key *le_key, const struct cpu_key *cpu_key) { @@ -155,15 +117,17 @@ inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from) to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id); to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid); - // find out version of the key + /* find out version of the key */ version = le_key_version(from); to->version = version; to->on_disk_key.k_offset = le_key_k_offset(version, from); to->on_disk_key.k_type = le_key_k_type(version, from); } -// this does not say which one is bigger, it only returns 1 if keys -// are not equal, 0 otherwise +/* + * this does not say which one is bigger, it only returns 1 if keys + * are not equal, 0 otherwise + */ inline int comp_le_keys(const struct reiserfs_key *k1, const struct reiserfs_key *k2) { @@ -177,24 +141,27 @@ inline int comp_le_keys(const struct reiserfs_key *k1, * *pos = number of the searched element if found, else the * * number of the first element that is larger than key. * **************************************************************************/ -/* For those not familiar with binary search: lbound is the leftmost item that it - could be, rbound the rightmost item that it could be. We examine the item - halfway between lbound and rbound, and that tells us either that we can increase - lbound, or decrease rbound, or that we have found it, or if lbound <= rbound that - there are no possible items, and we have not found it. With each examination we - cut the number of possible items it could be by one more than half rounded down, - or we find it. */ +/* + * For those not familiar with binary search: lbound is the leftmost item + * that it could be, rbound the rightmost item that it could be. We examine + * the item halfway between lbound and rbound, and that tells us either + * that we can increase lbound, or decrease rbound, or that we have found it, + * or if lbound <= rbound that there are no possible items, and we have not + * found it. With each examination we cut the number of possible items it + * could be by one more than half rounded down, or we find it. + */ static inline int bin_search(const void *key, /* Key to search for. */ const void *base, /* First item in the array. */ int num, /* Number of items in the array. */ - int width, /* Item size in the array. - searched. Lest the reader be - confused, note that this is crafted - as a general function, and when it - is applied specifically to the array - of item headers in a node, width - is actually the item header size not - the item size. */ + /* + * Item size in the array. searched. Lest the + * reader be confused, note that this is crafted + * as a general function, and when it is applied + * specifically to the array of item headers in a + * node, width is actually the item header size + * not the item size. + */ + int width, int *pos /* Number of the searched for element. */ ) { @@ -216,8 +183,10 @@ static inline int bin_search(const void *key, /* Key to search for. */ return ITEM_FOUND; /* Key found in the array. */ } - /* bin_search did not find given key, it returns position of key, - that is minimal and greater than the given one. */ + /* + * bin_search did not find given key, it returns position of key, + * that is minimal and greater than the given one. + */ *pos = lbound; return ITEM_NOT_FOUND; } @@ -228,16 +197,20 @@ const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; /* Maximal possible key. It is never in the tree. */ static const struct reiserfs_key MAX_KEY = { - __constant_cpu_to_le32(0xffffffff), - __constant_cpu_to_le32(0xffffffff), - {{__constant_cpu_to_le32(0xffffffff), - __constant_cpu_to_le32(0xffffffff)},} + cpu_to_le32(0xffffffff), + cpu_to_le32(0xffffffff), + {{cpu_to_le32(0xffffffff), + cpu_to_le32(0xffffffff)},} }; -/* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom - of the path, and going upwards. We must check the path's validity at each step. If the key is not in - the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this - case we return a special key, either MIN_KEY or MAX_KEY. */ +/* + * Get delimiting key of the buffer by looking for it in the buffers in the + * path, starting from the bottom of the path, and going upwards. We must + * check the path's validity at each step. If the key is not in the path, + * there is no delimiting key in the tree (buffer is first or last buffer + * in tree), and in this case we return a special key, either MIN_KEY or + * MAX_KEY. + */ static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path, const struct super_block *sb) { @@ -270,9 +243,12 @@ static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_pat PATH_OFFSET_PBUFFER(chk_path, path_offset + 1)->b_blocknr) return &MAX_KEY; - /* Return delimiting key if position in the parent is not equal to zero. */ + /* + * Return delimiting key if position in the parent + * is not equal to zero. + */ if (position) - return B_N_PDELIM_KEY(parent, position - 1); + return internal_key(parent, position - 1); } /* Return MIN_KEY if we are in the root of the buffer tree. */ if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> @@ -308,15 +284,23 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path, path_offset)) > B_NR_ITEMS(parent)) return &MIN_KEY; - /* Check whether parent at the path really points to the child. */ + /* + * Check whether parent at the path really points + * to the child. + */ if (B_N_CHILD_NUM(parent, position) != PATH_OFFSET_PBUFFER(chk_path, path_offset + 1)->b_blocknr) return &MIN_KEY; - /* Return delimiting key if position in the parent is not the last one. */ + + /* + * Return delimiting key if position in the parent + * is not the last one. + */ if (position != B_NR_ITEMS(parent)) - return B_N_PDELIM_KEY(parent, position); + return internal_key(parent, position); } + /* Return MAX_KEY if we are in the root of the buffer tree. */ if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> b_blocknr == SB_ROOT_BLOCK(sb)) @@ -324,13 +308,20 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path, return &MIN_KEY; } -/* Check whether a key is contained in the tree rooted from a buffer at a path. */ -/* This works by looking at the left and right delimiting keys for the buffer in the last path_element in - the path. These delimiting keys are stored at least one level above that buffer in the tree. If the - buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in - this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ -static inline int key_in_buffer(struct treepath *chk_path, /* Path which should be checked. */ - const struct cpu_key *key, /* Key which should be checked. */ +/* + * Check whether a key is contained in the tree rooted from a buffer at a path. + * This works by looking at the left and right delimiting keys for the buffer + * in the last path_element in the path. These delimiting keys are stored + * at least one level above that buffer in the tree. If the buffer is the + * first or last node in the tree order then one of the delimiting keys may + * be absent, and in this case get_lkey and get_rkey return a special key + * which is MIN_KEY or MAX_KEY. + */ +static inline int key_in_buffer( + /* Path which should be checked. */ + struct treepath *chk_path, + /* Key which should be checked. */ + const struct cpu_key *key, struct super_block *sb ) { @@ -359,9 +350,11 @@ int reiserfs_check_path(struct treepath *p) return 0; } -/* Drop the reference to each buffer in a path and restore +/* + * Drop the reference to each buffer in a path and restore * dirty bits clean when preparing the buffer for the log. - * This version should only be called from fix_nodes() */ + * This version should only be called from fix_nodes() + */ void pathrelse_and_restore(struct super_block *sb, struct treepath *search_path) { @@ -418,14 +411,17 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) } ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih)); + + /* free space does not match to calculated amount of use space */ if (used_space != blocksize - blkh_free_space(blkh)) { - /* free space does not match to calculated amount of use space */ reiserfs_warning(NULL, "reiserfs-5082", "free space seems wrong: %z", bh); return 0; } - // FIXME: it is_leaf will hit performance too much - we may have - // return 1 here + /* + * FIXME: it is_leaf will hit performance too much - we may have + * return 1 here + */ /* check tables of item heads */ ih = (struct item_head *)(buf + BLKH_SIZE); @@ -460,7 +456,7 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) prev_location = ih_location(ih); } - // one may imagine much more checks + /* one may imagine many more checks */ return 1; } @@ -481,8 +477,8 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh) } nr = blkh_nr_item(blkh); + /* for internal which is not root we might check min number of keys */ if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { - /* for internal which is not root we might check min number of keys */ reiserfs_warning(NULL, "reiserfs-5088", "number of key seems wrong: %z", bh); return 0; @@ -494,12 +490,15 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh) "free space seems wrong: %z", bh); return 0; } - // one may imagine much more checks + + /* one may imagine many more checks */ return 1; } -// make sure that bh contains formatted node of reiserfs tree of -// 'level'-th level +/* + * make sure that bh contains formatted node of reiserfs tree of + * 'level'-th level + */ static int is_tree_node(struct buffer_head *bh, int level) { if (B_LEVEL(bh) != level) { @@ -546,7 +545,8 @@ static int search_by_key_reada(struct super_block *s, for (j = 0; j < i; j++) { /* * note, this needs attention if we are getting rid of the BKL - * you have to make sure the prepared bit isn't set on this buffer + * you have to make sure the prepared bit isn't set on this + * buffer */ if (!buffer_uptodate(bh[j])) { if (depth == -1) @@ -558,39 +558,34 @@ static int search_by_key_reada(struct super_block *s, return depth; } -/************************************************************************** - * Algorithm SearchByKey * - * look for item in the Disk S+Tree by its key * - * Input: sb - super block * - * key - pointer to the key to search * - * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR * - * search_path - path from the root to the needed leaf * - **************************************************************************/ - -/* This function fills up the path from the root to the leaf as it - descends the tree looking for the key. It uses reiserfs_bread to - try to find buffers in the cache given their block number. If it - does not find them in the cache it reads them from disk. For each - node search_by_key finds using reiserfs_bread it then uses - bin_search to look through that node. bin_search will find the - position of the block_number of the next node if it is looking - through an internal node. If it is looking through a leaf node - bin_search will find the position of the item which has key either - equal to given key, or which is the maximal key less than the given - key. search_by_key returns a path that must be checked for the - correctness of the top of the path but need not be checked for the - correctness of the bottom of the path */ -/* The function is NOT SCHEDULE-SAFE! */ -int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to search. */ - struct treepath *search_path,/* This structure was - allocated and initialized - by the calling - function. It is filled up - by this function. */ - int stop_level /* How far down the tree to search. To - stop at leaf level - set to - DISK_LEAF_NODE_LEVEL */ - ) +/* + * This function fills up the path from the root to the leaf as it + * descends the tree looking for the key. It uses reiserfs_bread to + * try to find buffers in the cache given their block number. If it + * does not find them in the cache it reads them from disk. For each + * node search_by_key finds using reiserfs_bread it then uses + * bin_search to look through that node. bin_search will find the + * position of the block_number of the next node if it is looking + * through an internal node. If it is looking through a leaf node + * bin_search will find the position of the item which has key either + * equal to given key, or which is the maximal key less than the given + * key. search_by_key returns a path that must be checked for the + * correctness of the top of the path but need not be checked for the + * correctness of the bottom of the path + */ +/* + * search_by_key - search for key (and item) in stree + * @sb: superblock + * @key: pointer to key to search for + * @search_path: Allocated and initialized struct treepath; Returned filled + * on success. + * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to + * stop at leaf level. + * + * The function is NOT SCHEDULE-SAFE! + */ +int search_by_key(struct super_block *sb, const struct cpu_key *key, + struct treepath *search_path, int stop_level) { b_blocknr_t block_number; int expected_level; @@ -609,17 +604,22 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s PROC_INFO_INC(sb, search_by_key); - /* As we add each node to a path we increase its count. This means that - we must be careful to release all nodes in a path before we either - discard the path struct or re-use the path struct, as we do here. */ + /* + * As we add each node to a path we increase its count. This means + * that we must be careful to release all nodes in a path before we + * either discard the path struct or re-use the path struct, as we + * do here. + */ pathrelse(search_path); right_neighbor_of_leaf_node = 0; - /* With each iteration of this loop we search through the items in the - current node, and calculate the next current node(next path element) - for the next iteration of this loop.. */ + /* + * With each iteration of this loop we search through the items in the + * current node, and calculate the next current node(next path element) + * for the next iteration of this loop.. + */ block_number = SB_ROOT_BLOCK(sb); expected_level = -1; while (1) { @@ -639,8 +639,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s ++search_path->path_length); fs_gen = get_generation(sb); - /* Read the next tree node, and set the last element in the path to - have a pointer to it. */ + /* + * Read the next tree node, and set the last element + * in the path to have a pointer to it. + */ if ((bh = last_element->pe_buffer = sb_getblk(sb, block_number))) { @@ -666,7 +668,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s if (!buffer_uptodate(bh)) goto io_error; } else { - io_error: +io_error: search_path->path_length--; pathrelse(search_path); return IO_ERROR; @@ -676,9 +678,12 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s expected_level = SB_TREE_HEIGHT(sb); expected_level--; - /* It is possible that schedule occurred. We must check whether the key - to search is still in the tree rooted from the current buffer. If - not then repeat search from the root. */ + /* + * It is possible that schedule occurred. We must check + * whether the key to search is still in the tree rooted + * from the current buffer. If not then repeat search + * from the root. + */ if (fs_changed(fs_gen, sb) && (!B_IS_IN_TREE(bh) || B_LEVEL(bh) != expected_level || @@ -689,8 +694,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s sbk_restarted[expected_level - 1]); pathrelse(search_path); - /* Get the root block number so that we can repeat the search - starting from the root. */ + /* + * Get the root block number so that we can + * repeat the search starting from the root. + */ block_number = SB_ROOT_BLOCK(sb); expected_level = -1; right_neighbor_of_leaf_node = 0; @@ -699,9 +706,11 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s continue; } - /* only check that the key is in the buffer if key is not - equal to the MAX_KEY. Latter case is only possible in - "finish_unfinished()" processing during mount. */ + /* + * only check that the key is in the buffer if key is not + * equal to the MAX_KEY. Latter case is only possible in + * "finish_unfinished()" processing during mount. + */ RFALSE(comp_keys(&MAX_KEY, key) && !key_in_buffer(search_path, key, sb), "PAP-5130: key is not in the buffer"); @@ -713,8 +722,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s } #endif - // make sure, that the node contents look like a node of - // certain level + /* + * make sure, that the node contents look like a node of + * certain level + */ if (!is_tree_node(bh, expected_level)) { reiserfs_error(sb, "vs-5150", "invalid format found in block %ld. " @@ -732,32 +743,42 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s "vs-5152: tree level (%d) is less than stop level (%d)", node_level, stop_level); - retval = bin_search(key, B_N_PITEM_HEAD(bh, 0), + retval = bin_search(key, item_head(bh, 0), B_NR_ITEMS(bh), (node_level == DISK_LEAF_NODE_LEVEL) ? IH_SIZE : KEY_SIZE, - &(last_element->pe_position)); + &last_element->pe_position); if (node_level == stop_level) { return retval; } /* we are not in the stop level */ + /* + * item has been found, so we choose the pointer which + * is to the right of the found one + */ if (retval == ITEM_FOUND) - /* item has been found, so we choose the pointer which is to the right of the found one */ last_element->pe_position++; - /* if item was not found we choose the position which is to - the left of the found item. This requires no code, - bin_search did it already. */ + /* + * if item was not found we choose the position which is to + * the left of the found item. This requires no code, + * bin_search did it already. + */ - /* So we have chosen a position in the current node which is - an internal node. Now we calculate child block number by - position in the node. */ + /* + * So we have chosen a position in the current node which is + * an internal node. Now we calculate child block number by + * position in the node. + */ block_number = B_N_CHILD_NUM(bh, last_element->pe_position); - /* if we are going to read leaf nodes, try for read ahead as well */ + /* + * if we are going to read leaf nodes, try for read + * ahead as well + */ if ((search_path->reada & PATH_READA) && node_level == DISK_LEAF_NODE_LEVEL + 1) { int pos = last_element->pe_position; @@ -779,7 +800,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s /* * check to make sure we're in the same object */ - le_key = B_N_PDELIM_KEY(bh, pos); + le_key = internal_key(bh, pos); if (le32_to_cpu(le_key->k_objectid) != key->on_disk_key.k_objectid) { break; @@ -789,26 +810,28 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s } } -/* Form the path to an item and position in this item which contains - file byte defined by key. If there is no such item - corresponding to the key, we point the path to the item with - maximal key less than key, and *pos_in_item is set to one - past the last entry/byte in the item. If searching for entry in a - directory item, and it is not found, *pos_in_item is set to one - entry more than the entry with maximal key which is less than the - sought key. - - Note that if there is no entry in this same node which is one more, - then we point to an imaginary entry. for direct items, the - position is in units of bytes, for indirect items the position is - in units of blocknr entries, for directory items the position is in - units of directory entries. */ - +/* + * Form the path to an item and position in this item which contains + * file byte defined by key. If there is no such item + * corresponding to the key, we point the path to the item with + * maximal key less than key, and *pos_in_item is set to one + * past the last entry/byte in the item. If searching for entry in a + * directory item, and it is not found, *pos_in_item is set to one + * entry more than the entry with maximal key which is less than the + * sought key. + * + * Note that if there is no entry in this same node which is one more, + * then we point to an imaginary entry. for direct items, the + * position is in units of bytes, for indirect items the position is + * in units of blocknr entries, for directory items the position is in + * units of directory entries. + */ /* The function is NOT SCHEDULE-SAFE! */ -int search_for_position_by_key(struct super_block *sb, /* Pointer to the super block. */ - const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */ - struct treepath *search_path /* Filled up by this function. */ - ) +int search_for_position_by_key(struct super_block *sb, + /* Key to search (cpu variable) */ + const struct cpu_key *p_cpu_key, + /* Filled up by this function. */ + struct treepath *search_path) { struct item_head *p_le_ih; /* pointer to on-disk structure */ int blk_size; @@ -830,7 +853,7 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b if (retval == ITEM_FOUND) { RFALSE(!ih_item_len - (B_N_PITEM_HEAD + (item_head (PATH_PLAST_BUFFER(search_path), PATH_LAST_POSITION(search_path))), "PAP-5165: item length equals zero"); @@ -844,14 +867,14 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b /* Item is not found. Set path to the previous item. */ p_le_ih = - B_N_PITEM_HEAD(PATH_PLAST_BUFFER(search_path), + item_head(PATH_PLAST_BUFFER(search_path), --PATH_LAST_POSITION(search_path)); blk_size = sb->s_blocksize; - if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) { + if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key)) return FILE_NOT_FOUND; - } - // FIXME: quite ugly this far + + /* FIXME: quite ugly this far */ item_offset = le_ih_k_offset(p_le_ih); offset = cpu_key_k_offset(p_cpu_key); @@ -866,8 +889,10 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b return POSITION_FOUND; } - /* Needed byte is not contained in the item pointed to by the - path. Set pos_in_item out of the item. */ + /* + * Needed byte is not contained in the item pointed to by the + * path. Set pos_in_item out of the item. + */ if (is_indirect_le_ih(p_le_ih)) pos_in_item(search_path) = ih_item_len(p_le_ih) / UNFM_P_SIZE; @@ -892,19 +917,17 @@ int comp_items(const struct item_head *stored_ih, const struct treepath *path) return 1; /* we need only to know, whether it is the same item */ - ih = get_ih(path); + ih = tp_item_head(path); return memcmp(stored_ih, ih, IH_SIZE); } -/* unformatted nodes are not logged anymore, ever. This is safe -** now -*/ +/* unformatted nodes are not logged anymore, ever. This is safe now */ #define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) -// block can not be forgotten as it is in I/O or held by someone +/* block can not be forgotten as it is in I/O or held by someone */ #define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) -// prepare for delete or cut of direct item +/* prepare for delete or cut of direct item */ static inline int prepare_for_direct_item(struct treepath *path, struct item_head *le_ih, struct inode *inode, @@ -917,9 +940,8 @@ static inline int prepare_for_direct_item(struct treepath *path, *cut_size = -(IH_SIZE + ih_item_len(le_ih)); return M_DELETE; } - // new file gets truncated + /* new file gets truncated */ if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) { - // round_len = ROUND_UP(new_file_length); /* this was new_file_length < le_ih ... */ if (round_len < le_ih_k_offset(le_ih)) { @@ -933,12 +955,13 @@ static inline int prepare_for_direct_item(struct treepath *path, return M_CUT; /* Cut from this item. */ } - // old file: items may have any length + /* old file: items may have any length */ if (new_file_length < le_ih_k_offset(le_ih)) { *cut_size = -(IH_SIZE + ih_item_len(le_ih)); return M_DELETE; /* Delete this item. */ } + /* Calculate first position and size for cutting from item. */ *cut_size = -(ih_item_len(le_ih) - (pos_in_item(path) = @@ -957,12 +980,15 @@ static inline int prepare_for_direntry_item(struct treepath *path, RFALSE(ih_entry_count(le_ih) != 2, "PAP-5220: incorrect empty directory item (%h)", le_ih); *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - return M_DELETE; /* Delete the directory item containing "." and ".." entry. */ + /* Delete the directory item containing "." and ".." entry. */ + return M_DELETE; } if (ih_entry_count(le_ih) == 1) { - /* Delete the directory item such as there is one record only - in this item */ + /* + * Delete the directory item such as there is one record only + * in this item + */ *cut_size = -(IH_SIZE + ih_item_len(le_ih)); return M_DELETE; } @@ -976,18 +1002,34 @@ static inline int prepare_for_direntry_item(struct treepath *path, #define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1) -/* If the path points to a directory or direct item, calculate mode and the size cut, for balance. - If the path points to an indirect item, remove some number of its unformatted nodes. - In case of file truncate calculate whether this item must be deleted/truncated or last - unformatted node of this item will be converted to a direct item. - This function returns a determination of what balance mode the calling function should employ. */ -static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, const struct cpu_key *item_key, int *removed, /* Number of unformatted nodes which were removed - from end of the file. */ - int *cut_size, unsigned long long new_file_length /* MAX_KEY_OFFSET in case of delete. */ +/* + * If the path points to a directory or direct item, calculate mode + * and the size cut, for balance. + * If the path points to an indirect item, remove some number of its + * unformatted nodes. + * In case of file truncate calculate whether this item must be + * deleted/truncated or last unformatted node of this item will be + * converted to a direct item. + * This function returns a determination of what balance mode the + * calling function should employ. + */ +static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct treepath *path, + const struct cpu_key *item_key, + /* + * Number of unformatted nodes + * which were removed from end + * of the file. + */ + int *removed, + int *cut_size, + /* MAX_KEY_OFFSET in case of delete. */ + unsigned long long new_file_length ) { struct super_block *sb = inode->i_sb; - struct item_head *p_le_ih = PATH_PITEM_HEAD(path); + struct item_head *p_le_ih = tp_item_head(path); struct buffer_head *bh = PATH_PLAST_BUFFER(path); BUG_ON(!th->t_trans_id); @@ -1023,8 +1065,10 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st int pos = 0; if ( new_file_length == max_reiserfs_offset (inode) ) { - /* prepare_for_delete_or_cut() is called by - * reiserfs_delete_item() */ + /* + * prepare_for_delete_or_cut() is called by + * reiserfs_delete_item() + */ new_file_length = 0; delete = 1; } @@ -1033,27 +1077,30 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st need_re_search = 0; *cut_size = 0; bh = PATH_PLAST_BUFFER(path); - copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + copy_item_head(&s_ih, tp_item_head(path)); pos = I_UNFM_NUM(&s_ih); while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) { __le32 *unfm; __u32 block; - /* Each unformatted block deletion may involve one additional - * bitmap block into the transaction, thereby the initial - * journal space reservation might not be enough. */ + /* + * Each unformatted block deletion may involve + * one additional bitmap block into the transaction, + * thereby the initial journal space reservation + * might not be enough. + */ if (!delete && (*cut_size) != 0 && reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) break; - unfm = (__le32 *)B_I_PITEM(bh, &s_ih) + pos - 1; + unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1; block = get_block_num(unfm, 0); if (block != 0) { reiserfs_prepare_for_journal(sb, bh, 1); put_block_num(unfm, 0, 0); - journal_mark_dirty(th, sb, bh); + journal_mark_dirty(th, bh); reiserfs_free_block(th, inode, block, 1); } @@ -1074,17 +1121,21 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st break; } } - /* a trick. If the buffer has been logged, this will do nothing. If - ** we've broken the loop without logging it, it will restore the - ** buffer */ + /* + * a trick. If the buffer has been logged, this will + * do nothing. If we've broken the loop without logging + * it, it will restore the buffer + */ reiserfs_restore_prepared_buffer(sb, bh); } while (need_re_search && search_for_position_by_key(sb, item_key, path) == POSITION_FOUND); pos_in_item(path) = pos * UNFM_P_SIZE; if (*cut_size == 0) { - /* Nothing were cut. maybe convert last unformatted node to the - * direct item? */ + /* + * Nothing was cut. maybe convert last unformatted node to the + * direct item? + */ result = M_CONVERT; } return result; @@ -1095,7 +1146,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st static int calc_deleted_bytes_number(struct tree_balance *tb, char mode) { int del_size; - struct item_head *p_le_ih = PATH_PITEM_HEAD(tb->tb_path); + struct item_head *p_le_ih = tp_item_head(tb->tb_path); if (is_statdata_le_ih(p_le_ih)) return 0; @@ -1104,9 +1155,11 @@ static int calc_deleted_bytes_number(struct tree_balance *tb, char mode) (mode == M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0]; if (is_direntry_le_ih(p_le_ih)) { - /* return EMPTY_DIR_SIZE; We delete emty directoris only. - * we can't use EMPTY_DIR_SIZE, as old format dirs have a different - * empty size. ick. FIXME, is this right? */ + /* + * return EMPTY_DIR_SIZE; We delete emty directories only. + * we can't use EMPTY_DIR_SIZE, as old format dirs have a + * different empty size. ick. FIXME, is this right? + */ return del_size; } @@ -1169,7 +1222,8 @@ char head2type(struct item_head *ih) } #endif -/* Delete object item. +/* + * Delete object item. * th - active transaction handle * path - path to the deleted item * item_key - key to search for the deleted item @@ -1212,7 +1266,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); - copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + copy_item_head(&s_ih, tp_item_head(path)); s_del_balance.insert_size[0] = del_size; ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); @@ -1221,7 +1275,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, PROC_INFO_INC(sb, delete_item_restarted); - // file system changed, repeat search + /* file system changed, repeat search */ ret_value = search_for_position_by_key(sb, item_key, path); if (ret_value == IO_ERROR) @@ -1238,16 +1292,18 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, unfix_nodes(&s_del_balance); return 0; } - // reiserfs_delete_item returns item length when success + + /* reiserfs_delete_item returns item length when success */ ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); - q_ih = get_ih(path); + q_ih = tp_item_head(path); quota_cut_bytes = ih_item_len(q_ih); - /* hack so the quota code doesn't have to guess if the file - ** has a tail. On tail insert, we allocate quota for 1 unformatted node. - ** We test the offset because the tail might have been - ** split into multiple items, and we only want to decrement for - ** the unfm node once + /* + * hack so the quota code doesn't have to guess if the file has a + * tail. On tail insert, we allocate quota for 1 unformatted node. + * We test the offset because the tail might have been + * split into multiple items, and we only want to decrement for + * the unfm node once */ if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) { if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) { @@ -1261,33 +1317,38 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, int off; char *data; - /* We are in direct2indirect conversion, so move tail contents - to the unformatted node */ - /* note, we do the copy before preparing the buffer because we - ** don't care about the contents of the unformatted node yet. - ** the only thing we really care about is the direct item's data - ** is in the unformatted node. - ** - ** Otherwise, we would have to call reiserfs_prepare_for_journal on - ** the unformatted node, which might schedule, meaning we'd have to - ** loop all the way back up to the start of the while loop. - ** - ** The unformatted node must be dirtied later on. We can't be - ** sure here if the entire tail has been deleted yet. - ** - ** un_bh is from the page cache (all unformatted nodes are - ** from the page cache) and might be a highmem page. So, we - ** can't use un_bh->b_data. - ** -clm + /* + * We are in direct2indirect conversion, so move tail contents + * to the unformatted node + */ + /* + * note, we do the copy before preparing the buffer because we + * don't care about the contents of the unformatted node yet. + * the only thing we really care about is the direct item's + * data is in the unformatted node. + * + * Otherwise, we would have to call + * reiserfs_prepare_for_journal on the unformatted node, + * which might schedule, meaning we'd have to loop all the + * way back up to the start of the while loop. + * + * The unformatted node must be dirtied later on. We can't be + * sure here if the entire tail has been deleted yet. + * + * un_bh is from the page cache (all unformatted nodes are + * from the page cache) and might be a highmem page. So, we + * can't use un_bh->b_data. + * -clm */ data = kmap_atomic(un_bh->b_page); off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); memcpy(data + off, - B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih), + ih_item_body(PATH_PLAST_BUFFER(path), &s_ih), ret_value); kunmap_atomic(data); } + /* Perform balancing after all resources have been collected at once. */ do_balance(&s_del_balance, NULL, NULL, M_DELETE); @@ -1304,20 +1365,21 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, return ret_value; } -/* Summary Of Mechanisms For Handling Collisions Between Processes: - - deletion of the body of the object is performed by iput(), with the - result that if multiple processes are operating on a file, the - deletion of the body of the file is deferred until the last process - that has an open inode performs its iput(). - - writes and truncates are protected from collisions by use of - semaphores. - - creates, linking, and mknod are protected from collisions with other - processes by making the reiserfs_add_entry() the last step in the - creation, and then rolling back all changes if there was a collision. - - Hans +/* + * Summary Of Mechanisms For Handling Collisions Between Processes: + * + * deletion of the body of the object is performed by iput(), with the + * result that if multiple processes are operating on a file, the + * deletion of the body of the file is deferred until the last process + * that has an open inode performs its iput(). + * + * writes and truncates are protected from collisions by use of + * semaphores. + * + * creates, linking, and mknod are protected from collisions with other + * processes by making the reiserfs_add_entry() the last step in the + * creation, and then rolling back all changes if there was a collision. + * - Hans */ /* this deletes item which never gets split */ @@ -1347,7 +1409,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, } if (retval != ITEM_FOUND) { pathrelse(&path); - // No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir + /* + * No need for a warning, if there is just no free + * space to insert '..' item into the + * newly-created subdir + */ if (! ((unsigned long long) GET_HASH_VALUE(le_key_k_offset @@ -1362,11 +1428,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, } if (!tb_init) { tb_init = 1; - item_len = ih_item_len(PATH_PITEM_HEAD(&path)); + item_len = ih_item_len(tp_item_head(&path)); init_tb_struct(th, &tb, th->t_super, &path, -(IH_SIZE + item_len)); } - quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)); + quota_cut_bytes = ih_item_len(tp_item_head(&path)); retval = fix_nodes(M_DELETE, &tb, NULL, NULL); if (retval == REPEAT_SEARCH) { @@ -1376,7 +1442,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, if (retval == CARRY_ON) { do_balance(&tb, NULL, NULL, M_DELETE); - if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */ + /* + * Should we count quota for item? (we don't + * count quotas for save-links) + */ + if (inode) { int depth; #ifdef REISERQUOTA_DEBUG reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, @@ -1391,7 +1461,8 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, } break; } - // IO_ERROR, NO_DISK_SPACE, etc + + /* IO_ERROR, NO_DISK_SPACE, etc */ reiserfs_warning(th->t_super, "vs-5360", "could not delete %K due to fix_nodes failure", &cpu_key); @@ -1447,11 +1518,13 @@ static void unmap_buffers(struct page *page, loff_t pos) do { next = bh->b_this_page; - /* we want to unmap the buffers that contain the tail, and - ** all the buffers after it (since the tail must be at the - ** end of the file). We don't want to unmap file data - ** before the tail, since it might be dirty and waiting to - ** reach disk + /* + * we want to unmap the buffers that contain + * the tail, and all the buffers after it + * (since the tail must be at the end of the + * file). We don't want to unmap file data + * before the tail, since it might be dirty + * and waiting to reach disk */ cur_index += bh->b_size; if (cur_index > tail_index) { @@ -1476,9 +1549,10 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); BUG_ON(new_file_size != inode->i_size); - /* the page being sent in could be NULL if there was an i/o error - ** reading in the last block. The user will hit problems trying to - ** read the file, but for now we just skip the indirect2direct + /* + * the page being sent in could be NULL if there was an i/o error + * reading in the last block. The user will hit problems trying to + * read the file, but for now we just skip the indirect2direct */ if (atomic_read(&inode->i_count) > 1 || !tail_has_to_be_packed(inode) || @@ -1490,17 +1564,18 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, pathrelse(path); return cut_bytes; } + /* Perform the conversion to a direct_item. */ - /* return indirect_to_direct(inode, path, item_key, - new_file_size, mode); */ return indirect2direct(th, inode, page, path, item_key, new_file_size, mode); } -/* we did indirect_to_direct conversion. And we have inserted direct - item successesfully, but there were no disk space to cut unfm - pointer being converted. Therefore we have to delete inserted - direct item(s) */ +/* + * we did indirect_to_direct conversion. And we have inserted direct + * item successesfully, but there were no disk space to cut unfm + * pointer being converted. Therefore we have to delete inserted + * direct item(s) + */ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path) { @@ -1509,7 +1584,7 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, int removed; BUG_ON(!th->t_trans_id); - make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); // !!!! + make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); tail_key.key_length = 4; tail_len = @@ -1521,7 +1596,7 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, reiserfs_panic(inode->i_sb, "vs-5615", "found invalid item"); RFALSE(path->pos_in_item != - ih_item_len(PATH_PITEM_HEAD(path)) - 1, + ih_item_len(tp_item_head(path)) - 1, "vs-5616: appended bytes found"); PATH_LAST_POSITION(path)--; @@ -1539,7 +1614,6 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct " "conversion has been rolled back due to " "lack of disk space"); - //mark_file_without_tail (inode); mark_inode_dirty(inode); } @@ -1551,15 +1625,18 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, struct page *page, loff_t new_file_size) { struct super_block *sb = inode->i_sb; - /* Every function which is going to call do_balance must first - create a tree_balance structure. Then it must fill up this - structure by using the init_tb_struct and fix_nodes functions. - After that we can make tree balancing. */ + /* + * Every function which is going to call do_balance must first + * create a tree_balance structure. Then it must fill up this + * structure by using the init_tb_struct and fix_nodes functions. + * After that we can make tree balancing. + */ struct tree_balance s_cut_balance; struct item_head *p_le_ih; - int cut_size = 0, /* Amount to be cut. */ - ret_value = CARRY_ON, removed = 0, /* Number of the removed unformatted nodes. */ - is_inode_locked = 0; + int cut_size = 0; /* Amount to be cut. */ + int ret_value = CARRY_ON; + int removed = 0; /* Number of the removed unformatted nodes. */ + int is_inode_locked = 0; char mode; /* Mode of the balance. */ int retval2 = -1; int quota_cut_bytes; @@ -1571,21 +1648,27 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, init_tb_struct(th, &s_cut_balance, inode->i_sb, path, cut_size); - /* Repeat this loop until we either cut the item without needing - to balance, or we fix_nodes without schedule occurring */ + /* + * Repeat this loop until we either cut the item without needing + * to balance, or we fix_nodes without schedule occurring + */ while (1) { - /* Determine the balance mode, position of the first byte to - be cut, and size to be cut. In case of the indirect item - free unformatted nodes which are pointed to by the cut - pointers. */ + /* + * Determine the balance mode, position of the first byte to + * be cut, and size to be cut. In case of the indirect item + * free unformatted nodes which are pointed to by the cut + * pointers. + */ mode = prepare_for_delete_or_cut(th, inode, path, item_key, &removed, &cut_size, new_file_size); if (mode == M_CONVERT) { - /* convert last unformatted node to direct item or leave - tail in the unformatted node */ + /* + * convert last unformatted node to direct item or + * leave tail in the unformatted node + */ RFALSE(ret_value != CARRY_ON, "PAP-5570: can not convert twice"); @@ -1599,15 +1682,20 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, is_inode_locked = 1; - /* removing of last unformatted node will change value we - have to return to truncate. Save it */ + /* + * removing of last unformatted node will + * change value we have to return to truncate. + * Save it + */ retval2 = ret_value; - /*retval2 = sb->s_blocksize - (new_file_size & (sb->s_blocksize - 1)); */ - /* So, we have performed the first part of the conversion: - inserting the new direct item. Now we are removing the - last unformatted node pointer. Set key to search for - it. */ + /* + * So, we have performed the first part of the + * conversion: + * inserting the new direct item. Now we are + * removing the last unformatted node pointer. + * Set key to search for it. + */ set_cpu_key_k_type(item_key, TYPE_INDIRECT); item_key->key_length = 4; new_file_size -= @@ -1650,11 +1738,13 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, return (ret_value == IO_ERROR) ? -EIO : -ENOENT; } /* while */ - // check fix_nodes results (IO_ERROR or NO_DISK_SPACE) + /* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */ if (ret_value != CARRY_ON) { if (is_inode_locked) { - // FIXME: this seems to be not needed: we are always able - // to cut item + /* + * FIXME: this seems to be not needed: we are always + * able to cut item + */ indirect_to_direct_roll_back(th, inode, path); } if (ret_value == NO_DISK_SPACE) @@ -1671,22 +1761,23 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, /* Calculate number of bytes that need to be cut from the item. */ quota_cut_bytes = (mode == - M_DELETE) ? ih_item_len(get_ih(path)) : -s_cut_balance. + M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance. insert_size[0]; if (retval2 == -1) ret_value = calc_deleted_bytes_number(&s_cut_balance, mode); else ret_value = retval2; - /* For direct items, we only change the quota when deleting the last - ** item. + /* + * For direct items, we only change the quota when deleting the last + * item. */ - p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path); + p_le_ih = tp_item_head(s_cut_balance.tb_path); if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) { if (mode == M_DELETE && (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) == 1) { - // FIXME: this is to keep 3.5 happy + /* FIXME: this is to keep 3.5 happy */ REISERFS_I(inode)->i_first_direct_byte = U32_MAX; quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; } else { @@ -1696,10 +1787,12 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, #ifdef CONFIG_REISERFS_CHECK if (is_inode_locked) { struct item_head *le_ih = - PATH_PITEM_HEAD(s_cut_balance.tb_path); - /* we are going to complete indirect2direct conversion. Make - sure, that we exactly remove last unformatted node pointer - of the item */ + tp_item_head(s_cut_balance.tb_path); + /* + * we are going to complete indirect2direct conversion. Make + * sure, that we exactly remove last unformatted node pointer + * of the item + */ if (!is_indirect_le_ih(le_ih)) reiserfs_panic(sb, "vs-5652", "item must be indirect %h", le_ih); @@ -1717,17 +1810,20 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, "(CUT, insert_size==%d)", le_ih, s_cut_balance.insert_size[0]); } - /* it would be useful to make sure, that right neighboring - item is direct item of this file */ + /* + * it would be useful to make sure, that right neighboring + * item is direct item of this file + */ } #endif do_balance(&s_cut_balance, NULL, NULL, mode); if (is_inode_locked) { - /* we've done an indirect->direct conversion. when the data block - ** was freed, it was removed from the list of blocks that must - ** be flushed before the transaction commits, make sure to - ** unmap and invalidate it + /* + * we've done an indirect->direct conversion. when the + * data block was freed, it was removed from the list of + * blocks that must be flushed before the transaction + * commits, make sure to unmap and invalidate it */ unmap_buffers(page, tail_pos); REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; @@ -1758,20 +1854,25 @@ static void truncate_directory(struct reiserfs_transaction_handle *th, set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA); } -/* Truncate file to the new size. Note, this must be called with a transaction - already started */ +/* + * Truncate file to the new size. Note, this must be called with a + * transaction already started + */ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, - struct inode *inode, /* ->i_size contains new size */ + struct inode *inode, /* ->i_size contains new size */ struct page *page, /* up to date for last block */ - int update_timestamps /* when it is called by - file_release to convert - the tail - no timestamps - should be updated */ + /* + * when it is called by file_release to convert + * the tail - no timestamps should be updated + */ + int update_timestamps ) { INITIALIZE_PATH(s_search_path); /* Path to the current object item. */ struct item_head *p_le_ih; /* Pointer to an item header. */ - struct cpu_key s_item_key; /* Key to search for a previous file item. */ + + /* Key to search for a previous file item. */ + struct cpu_key s_item_key; loff_t file_size, /* Old file size. */ new_file_size; /* New file size. */ int deleted; /* Number of deleted or truncated bytes. */ @@ -1784,8 +1885,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, || S_ISLNK(inode->i_mode))) return 0; + /* deletion of directory - no need to update timestamps */ if (S_ISDIR(inode->i_mode)) { - // deletion of directory - no need to update timestamps truncate_directory(th, inode); return 0; } @@ -1793,7 +1894,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, /* Get new file size. */ new_file_size = inode->i_size; - // FIXME: note, that key type is unimportant here + /* FIXME: note, that key type is unimportant here */ make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, 3); @@ -1819,7 +1920,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, s_search_path.pos_in_item--; /* Get real file size (total length of all file items) */ - p_le_ih = PATH_PITEM_HEAD(&s_search_path); + p_le_ih = tp_item_head(&s_search_path); if (is_statdata_le_ih(p_le_ih)) file_size = 0; else { @@ -1827,9 +1928,11 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, int bytes = op_bytes_number(p_le_ih, inode->i_sb->s_blocksize); - /* this may mismatch with real file size: if last direct item - had no padding zeros and last unformatted node had no free - space, this file would have this file size */ + /* + * this may mismatch with real file size: if last direct item + * had no padding zeros and last unformatted node had no free + * space, this file would have this file size + */ file_size = offset + bytes - 1; } /* @@ -1867,18 +1970,20 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, set_cpu_key_k_offset(&s_item_key, file_size); - /* While there are bytes to truncate and previous file item is presented in the tree. */ + /* + * While there are bytes to truncate and previous + * file item is presented in the tree. + */ /* - ** This loop could take a really long time, and could log - ** many more blocks than a transaction can hold. So, we do a polite - ** journal end here, and if the transaction needs ending, we make - ** sure the file is consistent before ending the current trans - ** and starting a new one + * This loop could take a really long time, and could log + * many more blocks than a transaction can hold. So, we do + * a polite journal end here, and if the transaction needs + * ending, we make sure the file is consistent before ending + * the current trans and starting a new one */ if (journal_transaction_should_end(th, 0) || reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) { - int orig_len_alloc = th->t_blocks_allocated; pathrelse(&s_search_path); if (update_timestamps) { @@ -1887,7 +1992,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, } reiserfs_update_sd(th, inode); - err = journal_end(th, inode->i_sb, orig_len_alloc); + err = journal_end(th); if (err) goto out; err = journal_begin(th, inode->i_sb, @@ -1904,25 +2009,25 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", new_file_size, file_size, s_item_key.on_disk_key.k_objectid); - update_and_out: +update_and_out: if (update_timestamps) { - // this is truncate, not file closing + /* this is truncate, not file closing */ inode->i_mtime = CURRENT_TIME_SEC; inode->i_ctime = CURRENT_TIME_SEC; } reiserfs_update_sd(th, inode); - out: +out: pathrelse(&s_search_path); return err; } #ifdef CONFIG_REISERFS_CHECK -// this makes sure, that we __append__, not overwrite or add holes +/* this makes sure, that we __append__, not overwrite or add holes */ static void check_research_for_paste(struct treepath *path, const struct cpu_key *key) { - struct item_head *found_ih = get_ih(path); + struct item_head *found_ih = tp_item_head(path); if (is_direct_le_ih(found_ih)) { if (le_ih_k_offset(found_ih) + @@ -1952,13 +2057,22 @@ static void check_research_for_paste(struct treepath *path, } #endif /* config reiserfs check */ -/* Paste bytes to the existing item. Returns bytes number pasted into the item. */ -int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *search_path, /* Path to the pasted item. */ - const struct cpu_key *key, /* Key to search for the needed item. */ - struct inode *inode, /* Inode item belongs to */ - const char *body, /* Pointer to the bytes to paste. */ +/* + * Paste bytes to the existing item. + * Returns bytes number pasted into the item. + */ +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, + /* Path to the pasted item. */ + struct treepath *search_path, + /* Key to search for the needed item. */ + const struct cpu_key *key, + /* Inode item belongs to */ + struct inode *inode, + /* Pointer to the bytes to paste. */ + const char *body, + /* Size of pasted bytes. */ int pasted_size) -{ /* Size of pasted bytes. */ +{ struct super_block *sb = inode->i_sb; struct tree_balance s_paste_balance; int retval; @@ -1973,7 +2087,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): allocating %u id=%u type=%c", pasted_size, inode->i_uid, - key2type(&(key->on_disk_key))); + key2type(&key->on_disk_key)); #endif depth = reiserfs_write_unlock_nested(sb); @@ -1997,7 +2111,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, body)) == REPEAT_SEARCH) { - search_again: +search_again: /* file system changed while we were in the fix_nodes */ PROC_INFO_INC(th->t_super, paste_into_item_restarted); retval = @@ -2019,21 +2133,23 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree #endif } - /* Perform balancing after all resources are collected by fix_nodes, and - accessing them will not risk triggering schedule. */ + /* + * Perform balancing after all resources are collected by fix_nodes, + * and accessing them will not risk triggering schedule. + */ if (retval == CARRY_ON) { do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE); return 0; } retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; - error_out: +error_out: /* this also releases the path */ unfix_nodes(&s_paste_balance); #ifdef REISERQUOTA_DEBUG reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): freeing %u id=%u type=%c", pasted_size, inode->i_uid, - key2type(&(key->on_disk_key))); + key2type(&key->on_disk_key)); #endif depth = reiserfs_write_unlock_nested(sb); dquot_free_space_nodirty(inode, pasted_size); @@ -2041,7 +2157,8 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree return retval; } -/* Insert new item into the buffer at the path. +/* + * Insert new item into the buffer at the path. * th - active transaction handle * path - path to the inserted item * ih - pointer to the item header to insert @@ -2064,8 +2181,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, fs_gen = get_generation(inode->i_sb); quota_bytes = ih_item_len(ih); - /* hack so the quota code doesn't have to guess if the file has - ** a tail, links are always tails, so there's no guessing needed + /* + * hack so the quota code doesn't have to guess + * if the file has a tail, links are always tails, + * so there's no guessing needed */ if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih)) quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE; @@ -2074,8 +2193,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif - /* We can't dirty inode here. It would be immediately written but - * appropriate stat item isn't inserted yet... */ + /* + * We can't dirty inode here. It would be immediately + * written but appropriate stat item isn't inserted yet... + */ depth = reiserfs_write_unlock_nested(inode->i_sb); retval = dquot_alloc_space_nodirty(inode, quota_bytes); reiserfs_write_lock_nested(inode->i_sb, depth); @@ -2089,7 +2210,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, #ifdef DISPLACE_NEW_PACKING_LOCALITIES s_ins_balance.key = key->on_disk_key; #endif - /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */ + /* + * DQUOT_* can schedule, must check to be sure calling + * fix_nodes is safe + */ if (inode && fs_changed(fs_gen, inode->i_sb)) { goto search_again; } @@ -2097,7 +2221,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, while ((retval = fix_nodes(M_INSERT, &s_ins_balance, ih, body)) == REPEAT_SEARCH) { - search_again: +search_again: /* file system changed while we were in the fix_nodes */ PROC_INFO_INC(th->t_super, insert_item_restarted); retval = search_item(th->t_super, key, path); @@ -2121,7 +2245,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, } retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; - error_out: +error_out: /* also releases the path */ unfix_nodes(&s_ins_balance); #ifdef REISERQUOTA_DEBUG diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3ead145dadc..a392cef6acc 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -62,7 +62,6 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) static int reiserfs_remount(struct super_block *s, int *flags, char *data); static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); -void show_alloc_options(struct seq_file *seq, struct super_block *s); static int reiserfs_sync_fs(struct super_block *s, int wait) { @@ -75,7 +74,7 @@ static int reiserfs_sync_fs(struct super_block *s, int wait) dquot_writeback_dquots(s, -1); reiserfs_write_lock(s); if (!journal_begin(&th, s, 1)) - if (!journal_end_sync(&th, s, 1)) + if (!journal_end_sync(&th)) reiserfs_flush_old_commits(s); reiserfs_write_unlock(s); return 0; @@ -137,9 +136,9 @@ static int reiserfs_freeze(struct super_block *s) } else { reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); reiserfs_block_writes(&th); - journal_end_sync(&th, s, 1); + journal_end_sync(&th); } } reiserfs_write_unlock(s); @@ -154,13 +153,15 @@ static int reiserfs_unfreeze(struct super_block *s) extern const struct in_core_key MAX_IN_CORE_KEY; -/* this is used to delete "save link" when there are no items of a - file it points to. It can either happen if unlink is completed but - "save unlink" removal, or if file has both unlink and truncate - pending and as unlink completes first (because key of "save link" - protecting unlink is bigger that a key lf "save link" which - protects truncate), so there left no items to make truncate - completion on */ +/* + * this is used to delete "save link" when there are no items of a + * file it points to. It can either happen if unlink is completed but + * "save unlink" removal, or if file has both unlink and truncate + * pending and as unlink completes first (because key of "save link" + * protecting unlink is bigger that a key lf "save link" which + * protects truncate), so there left no items to make truncate + * completion on + */ static int remove_save_link_only(struct super_block *s, struct reiserfs_key *key, int oid_free) { @@ -177,7 +178,7 @@ static int remove_save_link_only(struct super_block *s, /* removals are protected by direct items */ reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid)); - return journal_end(&th, s, JOURNAL_PER_BALANCE_CNT); + return journal_end(&th); } #ifdef CONFIG_QUOTA @@ -259,7 +260,7 @@ static int finish_unfinished(struct super_block *s) break; } item_pos--; - ih = B_N_PITEM_HEAD(bh, item_pos); + ih = item_head(bh, item_pos); if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID) /* there are no "save" links anymore */ @@ -272,7 +273,7 @@ static int finish_unfinished(struct super_block *s) truncate = 0; /* reiserfs_iget needs k_dirid and k_objectid only */ - item = B_I_PITEM(bh, ih); + item = ih_item_body(bh, ih); obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item); obj_key.on_disk_key.k_objectid = le32_to_cpu(ih->ih_key.k_objectid); @@ -283,8 +284,10 @@ static int finish_unfinished(struct super_block *s) inode = reiserfs_iget(s, &obj_key); if (!inode) { - /* the unlink almost completed, it just did not manage to remove - "save" link and release objectid */ + /* + * the unlink almost completed, it just did not + * manage to remove "save" link and release objectid + */ reiserfs_warning(s, "vs-2180", "iget failed for %K", &obj_key); retval = remove_save_link_only(s, &save_link_key, 1); @@ -304,10 +307,13 @@ static int finish_unfinished(struct super_block *s) reiserfs_write_lock_nested(inode->i_sb, depth); if (truncate && S_ISDIR(inode->i_mode)) { - /* We got a truncate request for a dir which is impossible. - The only imaginable way is to execute unfinished truncate request - then boot into old kernel, remove the file and create dir with - the same key. */ + /* + * We got a truncate request for a dir which + * is impossible. The only imaginable way is to + * execute unfinished truncate request then boot + * into old kernel, remove the file and create dir + * with the same key. + */ reiserfs_warning(s, "green-2101", "impossible truncate on a " "directory %k. Please report", @@ -321,14 +327,16 @@ static int finish_unfinished(struct super_block *s) if (truncate) { REISERFS_I(inode)->i_flags |= i_link_saved_truncate_mask; - /* not completed truncate found. New size was committed together - with "save" link */ + /* + * not completed truncate found. New size was + * committed together with "save" link + */ reiserfs_info(s, "Truncating %k to %Ld ..", INODE_PKEY(inode), inode->i_size); - reiserfs_truncate_file(inode, - 0 - /*don't update modification time */ - ); + + /* don't update modification time */ + reiserfs_truncate_file(inode, 0); + retval = remove_save_link(inode, truncate); } else { REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; @@ -374,10 +382,12 @@ static int finish_unfinished(struct super_block *s) return retval; } -/* to protect file being unlinked from getting lost we "safe" link files - being unlinked. This link will be deleted in the same transaction with last - item of file. mounting the filesystem we scan all these links and remove - files which almost got lost */ +/* + * to protect file being unlinked from getting lost we "safe" link files + * being unlinked. This link will be deleted in the same transaction with last + * item of file. mounting the filesystem we scan all these links and remove + * files which almost got lost + */ void add_save_link(struct reiserfs_transaction_handle *th, struct inode *inode, int truncate) { @@ -496,7 +506,7 @@ int remove_save_link(struct inode *inode, int truncate) } else REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask; - return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); + return journal_end(&th); } static void reiserfs_kill_sb(struct super_block *s) @@ -531,19 +541,23 @@ static void reiserfs_put_super(struct super_block *s) reiserfs_write_lock(s); - /* change file system state to current state if it was mounted with read-write permissions */ + /* + * change file system state to current state if it was mounted + * with read-write permissions + */ if (!(s->s_flags & MS_RDONLY)) { if (!journal_begin(&th, s, 10)) { reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); set_sb_umount_state(SB_DISK_SUPER_BLOCK(s), REISERFS_SB(s)->s_mount_state); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); } } - /* note, journal_release checks for readonly mount, and can decide not - ** to do a journal_end + /* + * note, journal_release checks for readonly mount, and can + * decide not to do a journal_end */ journal_release(&th, s); @@ -560,6 +574,7 @@ static void reiserfs_put_super(struct super_block *s) reiserfs_write_unlock(s); mutex_destroy(&REISERFS_SB(s)->lock); + destroy_workqueue(REISERFS_SB(s)->commit_wq); kfree(s->s_fs_info); s->s_fs_info = NULL; } @@ -597,7 +612,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", sizeof(struct @@ -635,15 +650,16 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags) } reiserfs_write_lock(inode->i_sb); - /* this is really only used for atime updates, so they don't have - ** to be included in O_SYNC or fsync + /* + * this is really only used for atime updates, so they don't have + * to be included in O_SYNC or fsync */ err = journal_begin(&th, inode->i_sb, 1); if (err) goto out; reiserfs_update_sd(&th, inode); - journal_end(&th, inode->i_sb, 1); + journal_end(&th); out: reiserfs_write_unlock(inode->i_sb); @@ -789,31 +805,53 @@ static const struct export_operations reiserfs_export_ops = { .get_parent = reiserfs_get_parent, }; -/* this struct is used in reiserfs_getopt () for containing the value for those - mount options that have values rather than being toggles. */ +/* + * this struct is used in reiserfs_getopt () for containing the value for + * those mount options that have values rather than being toggles. + */ typedef struct { char *value; - int setmask; /* bitmask which is to set on mount_options bitmask when this - value is found, 0 is no bits are to be changed. */ - int clrmask; /* bitmask which is to clear on mount_options bitmask when this - value is found, 0 is no bits are to be changed. This is - applied BEFORE setmask */ + /* + * bitmask which is to set on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + */ + int setmask; + /* + * bitmask which is to clear on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + * This is applied BEFORE setmask + */ + int clrmask; } arg_desc_t; /* Set this bit in arg_required to allow empty arguments */ #define REISERFS_OPT_ALLOWEMPTY 31 -/* this struct is used in reiserfs_getopt() for describing the set of reiserfs - mount options */ +/* + * this struct is used in reiserfs_getopt() for describing the + * set of reiserfs mount options + */ typedef struct { char *option_name; - int arg_required; /* 0 if argument is not required, not 0 otherwise */ - const arg_desc_t *values; /* list of values accepted by an option */ - int setmask; /* bitmask which is to set on mount_options bitmask when this - value is found, 0 is no bits are to be changed. */ - int clrmask; /* bitmask which is to clear on mount_options bitmask when this - value is found, 0 is no bits are to be changed. This is - applied BEFORE setmask */ + + /* 0 if argument is not required, not 0 otherwise */ + int arg_required; + + /* list of values accepted by an option */ + const arg_desc_t *values; + + /* + * bitmask which is to set on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + */ + int setmask; + + /* + * bitmask which is to clear on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + * This is applied BEFORE setmask + */ + int clrmask; } opt_desc_t; /* possible values for -o data= */ @@ -834,8 +872,10 @@ static const arg_desc_t barrier_mode[] = { {.value = NULL} }; -/* possible values for "-o block-allocator=" and bits which are to be set in - s_mount_opt of reiserfs specific part of in-core super block */ +/* + * possible values for "-o block-allocator=" and bits which are to be set in + * s_mount_opt of reiserfs specific part of in-core super block + */ static const arg_desc_t balloc[] = { {"noborder", 1 << REISERFS_NO_BORDER, 0}, {"border", 0, 1 << REISERFS_NO_BORDER}, @@ -865,21 +905,25 @@ static const arg_desc_t error_actions[] = { {NULL, 0, 0}, }; -/* proceed only one option from a list *cur - string containing of mount options - opts - array of options which are accepted - opt_arg - if option is found and requires an argument and if it is specifed - in the input - pointer to the argument is stored here - bit_flags - if option requires to set a certain bit - it is set here - return -1 if unknown option is found, opt->arg_required otherwise */ +/* + * proceed only one option from a list *cur - string containing of mount + * options + * opts - array of options which are accepted + * opt_arg - if option is found and requires an argument and if it is specifed + * in the input - pointer to the argument is stored here + * bit_flags - if option requires to set a certain bit - it is set here + * return -1 if unknown option is found, opt->arg_required otherwise + */ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, char **opt_arg, unsigned long *bit_flags) { char *p; - /* foo=bar, - ^ ^ ^ - | | +-- option_end - | +-- arg_start - +-- option_start + /* + * foo=bar, + * ^ ^ ^ + * | | +-- option_end + * | +-- arg_start + * +-- option_start */ const opt_desc_t *opt; const arg_desc_t *arg; @@ -894,9 +938,12 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, } if (!strncmp(p, "alloc=", 6)) { - /* Ugly special case, probably we should redo options parser so that - it can understand several arguments for some options, also so that - it can fill several bitfields with option values. */ + /* + * Ugly special case, probably we should redo options + * parser so that it can understand several arguments for + * some options, also so that it can fill several bitfields + * with option values. + */ if (reiserfs_parse_alloc_options(s, p + 6)) { return -1; } else { @@ -959,7 +1006,10 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, return -1; } - /* move to the argument, or to next option if argument is not required */ + /* + * move to the argument, or to next option if argument is not + * required + */ p++; if (opt->arg_required @@ -996,12 +1046,20 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, } /* returns 0 if something is wrong in option string, 1 - otherwise */ -static int reiserfs_parse_options(struct super_block *s, char *options, /* string given via mount's -o */ +static int reiserfs_parse_options(struct super_block *s, + + /* string given via mount's -o */ + char *options, + + /* + * after the parsing phase, contains the + * collection of bitflags defining what + * mount options were selected. + */ unsigned long *mount_options, - /* after the parsing phase, contains the - collection of bitflags defining what - mount options were selected. */ - unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */ + + /* strtol-ed from NNN of resize=NNN */ + unsigned long *blocks, char **jdev_name, unsigned int *commit_max_age, char **qf_names, @@ -1011,7 +1069,10 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin char *arg = NULL; char *pos; opt_desc_t opts[] = { - /* Compatibility stuff, so that -o notail for old setups still work */ + /* + * Compatibility stuff, so that -o notail for old + * setups still work + */ {"tails",.arg_required = 't',.values = tails}, {"notail",.clrmask = (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, @@ -1056,8 +1117,10 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin *blocks = 0; if (!options || !*options) - /* use default configuration: create tails, journaling on, no - conversion to newest format */ + /* + * use default configuration: create tails, journaling on, no + * conversion to newest format + */ return 1; for (pos = options; pos;) { @@ -1110,7 +1173,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin if (c == 'j') { if (arg && *arg && jdev_name) { - if (*jdev_name) { //Hm, already assigned? + /* Hm, already assigned? */ + if (*jdev_name) { reiserfs_warning(s, "super-6510", "journal device was " "already specified to " @@ -1319,6 +1383,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) int i; #endif + sync_filesystem(s); reiserfs_write_lock(s); #ifdef CONFIG_QUOTA @@ -1362,8 +1427,10 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) safe_mask |= 1 << REISERFS_USRQUOTA; safe_mask |= 1 << REISERFS_GRPQUOTA; - /* Update the bitmask, taking care to keep - * the bits we're not allowed to change here */ + /* + * Update the bitmask, taking care to keep + * the bits we're not allowed to change here + */ REISERFS_SB(s)->s_mount_opt = (REISERFS_SB(s)-> s_mount_opt & ~safe_mask) | (mount_options & safe_mask); @@ -1410,7 +1477,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) /* Mounting a rw partition read-only. */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); } else { /* remount read-write */ if (!(s->s_flags & MS_RDONLY)) { @@ -1427,7 +1494,9 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) handle_data_mode(s, mount_options); handle_barrier_mode(s, mount_options); REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); - s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ + + /* now it is safe to call journal_begin */ + s->s_flags &= ~MS_RDONLY; err = journal_begin(&th, s, 10); if (err) goto out_err_unlock; @@ -1440,12 +1509,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (!old_format_only(s)) set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS; } /* this will force a full flush of all journal lists */ SB_JOURNAL(s)->j_must_wait = 1; - err = journal_end(&th, s, 10); + err = journal_end(&th); if (err) goto out_err_unlock; @@ -1479,7 +1548,7 @@ static int read_super_block(struct super_block *s, int offset) if (!bh) { reiserfs_warning(s, "sh-2006", "bread failed (dev %s, block %lu, size %lu)", - reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_id, offset / s->s_blocksize, s->s_blocksize); return 1; } @@ -1489,9 +1558,9 @@ static int read_super_block(struct super_block *s, int offset) brelse(bh); return 1; } - // - // ok, reiserfs signature (old or new) found in at the given offset - // + /* + * ok, reiserfs signature (old or new) found in at the given offset + */ fs_blocksize = sb_blocksize(rs); brelse(bh); sb_set_blocksize(s, fs_blocksize); @@ -1500,7 +1569,7 @@ static int read_super_block(struct super_block *s, int offset) if (!bh) { reiserfs_warning(s, "sh-2007", "bread failed (dev %s, block %lu, size %lu)", - reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_id, offset / s->s_blocksize, s->s_blocksize); return 1; } @@ -1509,7 +1578,7 @@ static int read_super_block(struct super_block *s, int offset) if (sb_blocksize(rs) != s->s_blocksize) { reiserfs_warning(s, "sh-2011", "can't find a reiserfs " "filesystem on (dev %s, block %Lu, size %lu)", - reiserfs_bdevname(s), + s->s_id, (unsigned long long)bh->b_blocknr, s->s_blocksize); brelse(bh); @@ -1529,9 +1598,11 @@ static int read_super_block(struct super_block *s, int offset) SB_BUFFER_WITH_SB(s) = bh; SB_DISK_SUPER_BLOCK(s) = rs; + /* + * magic is of non-standard journal filesystem, look at s_version to + * find which format is in use + */ if (is_reiserfs_jr(rs)) { - /* magic is of non-standard journal filesystem, look at s_version to - find which format is in use */ if (sb_version(rs) == REISERFS_VERSION_2) reiserfs_info(s, "found reiserfs format \"3.6\"" " with non-standard journal\n"); @@ -1545,8 +1616,10 @@ static int read_super_block(struct super_block *s, int offset) return 1; } } else - /* s_version of standard format may contain incorrect information, - so we just look at the magic string */ + /* + * s_version of standard format may contain incorrect + * information, so we just look at the magic string + */ reiserfs_info(s, "found reiserfs format \"%s\" with standard journal\n", is_reiserfs_3_5(rs) ? "3.5" : "3.6"); @@ -1558,8 +1631,9 @@ static int read_super_block(struct super_block *s, int offset) s->dq_op = &reiserfs_quota_operations; #endif - /* new format is limited by the 32 bit wide i_blocks field, want to - ** be one full block below that. + /* + * new format is limited by the 32 bit wide i_blocks field, want to + * be one full block below that. */ s->s_maxbytes = (512LL << 32) - s->s_blocksize; return 0; @@ -1568,7 +1642,7 @@ static int read_super_block(struct super_block *s, int offset) /* after journal replay, reread all bitmap and super blocks */ static int reread_meta_blocks(struct super_block *s) { - ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); + ll_rw_block(READ, 1, &SB_BUFFER_WITH_SB(s)); wait_on_buffer(SB_BUFFER_WITH_SB(s)); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); @@ -1578,14 +1652,15 @@ static int reread_meta_blocks(struct super_block *s) return 0; } -///////////////////////////////////////////////////// -// hash detection stuff +/* hash detection stuff */ -// if root directory is empty - we set default - Yura's - hash and -// warn about it -// FIXME: we look for only one name in a directory. If tea and yura -// bith have the same value - we ask user to send report to the -// mailing list +/* + * if root directory is empty - we set default - Yura's - hash and + * warn about it + * FIXME: we look for only one name in a directory. If tea and yura + * both have the same value - we ask user to send report to the + * mailing list + */ static __u32 find_hash_out(struct super_block *s) { int retval; @@ -1593,92 +1668,83 @@ static __u32 find_hash_out(struct super_block *s) struct cpu_key key; INITIALIZE_PATH(path); struct reiserfs_dir_entry de; + struct reiserfs_de_head *deh; __u32 hash = DEFAULT_HASH; + __u32 deh_hashval, teahash, r5hash, yurahash; inode = s->s_root->d_inode; - do { // Some serious "goto"-hater was there ;) - u32 teahash, r5hash, yurahash; + make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); + retval = search_by_entry_key(s, &key, &path, &de); + if (retval == IO_ERROR) { + pathrelse(&path); + return UNSET_HASH; + } + if (retval == NAME_NOT_FOUND) + de.de_entry_num--; + + set_de_name_and_namelen(&de); + deh = de.de_deh + de.de_entry_num; - make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); - retval = search_by_entry_key(s, &key, &path, &de); - if (retval == IO_ERROR) { - pathrelse(&path); - return UNSET_HASH; - } - if (retval == NAME_NOT_FOUND) - de.de_entry_num--; - set_de_name_and_namelen(&de); - if (deh_offset(&(de.de_deh[de.de_entry_num])) == DOT_DOT_OFFSET) { - /* allow override in this case */ - if (reiserfs_rupasov_hash(s)) { - hash = YURA_HASH; - } - reiserfs_info(s, "FS seems to be empty, autodetect " - "is using the default hash\n"); - break; - } - r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen)); - teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen)); - yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen)); - if (((teahash == r5hash) - && - (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) - == r5hash)) || ((teahash == yurahash) - && (yurahash == - GET_HASH_VALUE(deh_offset - (& - (de. - de_deh[de. - de_entry_num]))))) - || ((r5hash == yurahash) - && (yurahash == - GET_HASH_VALUE(deh_offset - (&(de.de_deh[de.de_entry_num])))))) { - reiserfs_warning(s, "reiserfs-2506", "Unable to " - "automatically detect hash function. " - "Please mount with -o " - "hash={tea,rupasov,r5}"); - hash = UNSET_HASH; - break; - } - if (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) == - yurahash) + if (deh_offset(deh) == DOT_DOT_OFFSET) { + /* allow override in this case */ + if (reiserfs_rupasov_hash(s)) hash = YURA_HASH; - else if (GET_HASH_VALUE - (deh_offset(&(de.de_deh[de.de_entry_num]))) == teahash) - hash = TEA_HASH; - else if (GET_HASH_VALUE - (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) - hash = R5_HASH; - else { - reiserfs_warning(s, "reiserfs-2506", - "Unrecognised hash function"); - hash = UNSET_HASH; - } - } while (0); + reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n"); + goto out; + } + + deh_hashval = GET_HASH_VALUE(deh_offset(deh)); + r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen)); + teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen)); + yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen)); + + if ((teahash == r5hash && deh_hashval == r5hash) || + (teahash == yurahash && deh_hashval == yurahash) || + (r5hash == yurahash && deh_hashval == yurahash)) { + reiserfs_warning(s, "reiserfs-2506", + "Unable to automatically detect hash " + "function. Please mount with -o " + "hash={tea,rupasov,r5}"); + hash = UNSET_HASH; + goto out; + } + if (deh_hashval == yurahash) + hash = YURA_HASH; + else if (deh_hashval == teahash) + hash = TEA_HASH; + else if (deh_hashval == r5hash) + hash = R5_HASH; + else { + reiserfs_warning(s, "reiserfs-2506", + "Unrecognised hash function"); + hash = UNSET_HASH; + } +out: pathrelse(&path); return hash; } -// finds out which hash names are sorted with +/* finds out which hash names are sorted with */ static int what_hash(struct super_block *s) { __u32 code; code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s)); - /* reiserfs_hash_detect() == true if any of the hash mount options - ** were used. We must check them to make sure the user isn't - ** using a bad hash value + /* + * reiserfs_hash_detect() == true if any of the hash mount options + * were used. We must check them to make sure the user isn't + * using a bad hash value */ if (code == UNSET_HASH || reiserfs_hash_detect(s)) code = find_hash_out(s); if (code != UNSET_HASH && reiserfs_hash_detect(s)) { - /* detection has found the hash, and we must check against the - ** mount options + /* + * detection has found the hash, and we must check against the + * mount options */ if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { reiserfs_warning(s, "reiserfs-2507", @@ -1700,7 +1766,10 @@ static int what_hash(struct super_block *s) code = UNSET_HASH; } } else { - /* find_hash_out was not called or could not determine the hash */ + /* + * find_hash_out was not called or + * could not determine the hash + */ if (reiserfs_rupasov_hash(s)) { code = YURA_HASH; } else if (reiserfs_tea_hash(s)) { @@ -1710,8 +1779,9 @@ static int what_hash(struct super_block *s) } } - /* if we are mounted RW, and we have a new valid hash code, update - ** the super + /* + * if we are mounted RW, and we have a new valid hash code, update + * the super */ if (code != UNSET_HASH && !(s->s_flags & MS_RDONLY) && @@ -1721,7 +1791,7 @@ static int what_hash(struct super_block *s) return code; } -// return pointer to appropriate function +/* return pointer to appropriate function */ static hashf_t hash_function(struct super_block *s) { switch (what_hash(s)) { @@ -1738,7 +1808,7 @@ static hashf_t hash_function(struct super_block *s) return NULL; } -// this is used to set up correct value for old partitions +/* this is used to set up correct value for old partitions */ static int function2code(hashf_t func) { if (func == keyed_hash) @@ -1748,7 +1818,7 @@ static int function2code(hashf_t func) if (func == r5_hash) return R5_HASH; - BUG(); // should never happen + BUG(); /* should never happen */ return 0; } @@ -1783,8 +1853,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL); sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO); sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH); - /* no preallocation minimum, be smart in - reiserfs_file_write instead */ + /* no preallocation minimum, be smart in reiserfs_file_write instead */ sbi->s_alloc_options.preallocmin = 0; /* Preallocate by 16 blocks (17-1) at once */ sbi->s_alloc_options.preallocsize = 17; @@ -1796,9 +1865,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) mutex_init(&sbi->lock); sbi->lock_depth = -1; + sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0, + s->s_id); + if (!sbi->commit_wq) { + SWARN(silent, s, "", "Cannot allocate commit workqueue"); + errval = -ENOMEM; + goto error_unlocked; + } + jdev_name = NULL; if (reiserfs_parse_options - (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, + (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name, &commit_max_age, qf_names, &qfmt) == 0) { goto error_unlocked; } @@ -1819,20 +1896,29 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) goto error_unlocked; } - /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ + /* + * try old format (undistributed bitmap, super block in 8-th 1k + * block of a device) + */ if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) old_format = 1; - /* try new format (64-th 1k block), which can contain reiserfs super block */ + + /* + * try new format (64-th 1k block), which can contain reiserfs + * super block + */ else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", - reiserfs_bdevname(s)); + s->s_id); goto error_unlocked; } rs = SB_DISK_SUPER_BLOCK(s); - /* Let's do basic sanity check to verify that underlying device is not - smaller than the filesystem. If the check fails then abort and scream, - because bad stuff will happen otherwise. */ + /* + * Let's do basic sanity check to verify that underlying device is not + * smaller than the filesystem. If the check fails then abort and + * scream, because bad stuff will happen otherwise. + */ if (s->s_bdev && s->s_bdev->bd_inode && i_size_read(s->s_bdev->bd_inode) < sb_block_count(rs) * sb_blocksize(rs)) { @@ -1876,15 +1962,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) printk("reiserfs: using flush barriers\n"); } - // set_device_ro(s->s_dev, 1) ; if (journal_init(s, jdev_name, old_format, commit_max_age)) { SWARN(silent, s, "sh-2022", "unable to initialize journal space"); goto error_unlocked; } else { - jinit_done = 1; /* once this is set, journal_release must be called - ** if we error out of the mount - */ + /* + * once this is set, journal_release must be called + * if we error out of the mount + */ + jinit_done = 1; } if (reread_meta_blocks(s)) { @@ -1905,7 +1992,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) args.dirid = REISERFS_ROOT_PARENT_OBJECTID; root_inode = iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, - reiserfs_init_locked_inode, (void *)(&args)); + reiserfs_init_locked_inode, (void *)&args); if (!root_inode) { SWARN(silent, s, "jmacd-10", "get root inode failed"); goto error_unlocked; @@ -1929,7 +2016,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) s->s_root = d_make_root(root_inode); if (!s->s_root) goto error; - // define and initialize hash function + /* define and initialize hash function */ sbi->s_hash_function = hash_function(s); if (sbi->s_hash_function == NULL) { dput(s->s_root); @@ -1939,11 +2026,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (is_reiserfs_3_5(rs) || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1)) - set_bit(REISERFS_3_5, &(sbi->s_properties)); + set_bit(REISERFS_3_5, &sbi->s_properties); else if (old_format) - set_bit(REISERFS_OLD_FORMAT, &(sbi->s_properties)); + set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties); else - set_bit(REISERFS_3_6, &(sbi->s_properties)); + set_bit(REISERFS_3_6, &sbi->s_properties); if (!(s->s_flags & MS_RDONLY)) { @@ -1958,10 +2045,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) set_sb_umount_state(rs, REISERFS_ERROR_FS); set_sb_fs_state(rs, 0); - /* Clear out s_bmap_nr if it would wrap. We can handle this + /* + * Clear out s_bmap_nr if it would wrap. We can handle this * case, but older revisions can't. This will cause the * file system to fail mount on those older implementations, - * avoiding corruption. -jeffm */ + * avoiding corruption. -jeffm + */ if (bmap_would_wrap(reiserfs_bmap_count(s)) && sb_bmap_nr(rs) != 0) { reiserfs_warning(s, "super-2030", "This file system " @@ -1974,8 +2063,10 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) } if (old_format_only(s)) { - /* filesystem of format 3.5 either with standard or non-standard - journal */ + /* + * filesystem of format 3.5 either with standard + * or non-standard journal + */ if (convert_reiserfs(s)) { /* and -o conv is given */ if (!silent) @@ -1983,8 +2074,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) "converting 3.5 filesystem to the 3.6 format"); if (is_reiserfs_3_5(rs)) - /* put magic string of 3.6 format. 2.2 will not be able to - mount this filesystem anymore */ + /* + * put magic string of 3.6 format. + * 2.2 will not be able to + * mount this filesystem anymore + */ memcpy(rs->s_v1.s_magic, reiserfs_3_6_magic_string, sizeof @@ -1992,8 +2086,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) set_sb_version(rs, REISERFS_VERSION_2); reiserfs_convert_objectid_map_v1(s); - set_bit(REISERFS_3_6, &(sbi->s_properties)); - clear_bit(REISERFS_3_5, &(sbi->s_properties)); + set_bit(REISERFS_3_6, &sbi->s_properties); + clear_bit(REISERFS_3_5, &sbi->s_properties); } else if (!silent) { reiserfs_info(s, "using 3.5.x disk format\n"); } @@ -2001,8 +2095,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); - errval = journal_end(&th, s, 1); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + errval = journal_end(&th); if (errval) { dput(s->s_root); s->s_root = NULL; @@ -2018,7 +2112,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) } reiserfs_write_lock(s); - /* look for files which were to be removed in previous session */ + /* + * look for files which were to be removed in previous session + */ finish_unfinished(s); } else { if (old_format_only(s) && !silent) { @@ -2034,7 +2130,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) } reiserfs_write_lock(s); } - // mark hash in super block: it could be unset. overwrite should be ok + /* + * mark hash in super block: it could be unset. overwrite should be ok + */ set_sb_hash_function_code(rs, function2code(sbi->s_hash_function)); handle_attrs(s); @@ -2111,9 +2209,7 @@ static int reiserfs_write_dquot(struct dquot *dquot) depth = reiserfs_write_unlock_nested(dquot->dq_sb); ret = dquot_commit(dquot); reiserfs_write_lock_nested(dquot->dq_sb, depth); - err = - journal_end(&th, dquot->dq_sb, - REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + err = journal_end(&th); if (!ret && err) ret = err; out: @@ -2136,9 +2232,7 @@ static int reiserfs_acquire_dquot(struct dquot *dquot) depth = reiserfs_write_unlock_nested(dquot->dq_sb); ret = dquot_acquire(dquot); reiserfs_write_lock_nested(dquot->dq_sb, depth); - err = - journal_end(&th, dquot->dq_sb, - REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + err = journal_end(&th); if (!ret && err) ret = err; out: @@ -2163,9 +2257,7 @@ static int reiserfs_release_dquot(struct dquot *dquot) } ret = dquot_release(dquot); reiserfs_write_lock(dquot->dq_sb); - err = - journal_end(&th, dquot->dq_sb, - REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + err = journal_end(&th); if (!ret && err) ret = err; reiserfs_write_unlock(dquot->dq_sb); @@ -2198,7 +2290,7 @@ static int reiserfs_write_info(struct super_block *sb, int type) depth = reiserfs_write_unlock_nested(sb); ret = dquot_commit_info(sb, type); reiserfs_write_lock_nested(sb, depth); - err = journal_end(&th, sb, 2); + err = journal_end(&th); if (!ret && err) ret = err; out: @@ -2238,7 +2330,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, goto out; } inode = path->dentry->d_inode; - /* We must not pack tails for quota files on reiserfs for quota IO to work */ + /* + * We must not pack tails for quota files on reiserfs for quota + * IO to work + */ if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { err = reiserfs_unpack(inode, NULL); if (err) { @@ -2268,7 +2363,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, err = journal_begin(&th, sb, 1); if (err) goto out; - err = journal_end_sync(&th, sb, 1); + err = journal_end_sync(&th); if (err) goto out; } @@ -2279,10 +2374,12 @@ out: return err; } -/* Read data from quotafile - avoid pagecache and such because we cannot afford +/* + * Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) - * we don't have to be afraid of races */ + * we don't have to be afraid of races + */ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { @@ -2303,7 +2400,10 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, sb->s_blocksize - offset < toread ? sb->s_blocksize - offset : toread; tmp_bh.b_state = 0; - /* Quota files are without tails so we can safely use this function */ + /* + * Quota files are without tails so we can safely + * use this function + */ reiserfs_write_lock(sb); err = reiserfs_get_block(inode, blk, &tmp_bh, 0); reiserfs_write_unlock(sb); @@ -2326,8 +2426,10 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, return len; } -/* Write to quotafile (we know the transaction is already started and has - * enough credits) */ +/* + * Write to quotafile (we know the transaction is already started and has + * enough credits) + */ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { @@ -2368,7 +2470,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, unlock_buffer(bh); reiserfs_write_lock(sb); reiserfs_prepare_for_journal(sb, bh, 1); - journal_mark_dirty(current->journal_info, sb, bh); + journal_mark_dirty(current->journal_info, bh); if (!journal_quota) reiserfs_add_ordered_list(inode, bh); reiserfs_write_unlock(sb); @@ -2402,18 +2504,18 @@ static int __init init_reiserfs_fs(void) { int ret; - if ((ret = init_inodecache())) { + ret = init_inodecache(); + if (ret) return ret; - } reiserfs_proc_info_global_init(); ret = register_filesystem(&reiserfs_fs_type); + if (ret) + goto out; - if (ret == 0) { - return 0; - } - + return 0; +out: reiserfs_proc_info_global_done(); destroy_inodecache(); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index 5e2624d12f7..f41e19b4bb4 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -1,5 +1,6 @@ /* - * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details + * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright + * details */ #include <linux/time.h> @@ -7,29 +8,41 @@ #include <linux/buffer_head.h> #include "reiserfs.h" -/* access to tail : when one is going to read tail it must make sure, that is not running. - direct2indirect and indirect2direct can not run concurrently */ +/* + * access to tail : when one is going to read tail it must make sure, that is + * not running. direct2indirect and indirect2direct can not run concurrently + */ -/* Converts direct items to an unformatted node. Panics if file has no - tail. -ENOSPC if no disk space for conversion */ -/* path points to first direct item of the file regarless of how many of - them are there */ +/* + * Converts direct items to an unformatted node. Panics if file has no + * tail. -ENOSPC if no disk space for conversion + */ +/* + * path points to first direct item of the file regardless of how many of + * them are there + */ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, struct buffer_head *unbh, loff_t tail_offset) { struct super_block *sb = inode->i_sb; struct buffer_head *up_to_date_bh; - struct item_head *p_le_ih = PATH_PITEM_HEAD(path); + struct item_head *p_le_ih = tp_item_head(path); unsigned long total_tail = 0; - struct cpu_key end_key; /* Key to search for the last byte of the - converted item. */ - struct item_head ind_ih; /* new indirect item to be inserted or - key of unfm pointer to be pasted */ - int blk_size, retval; /* returned value for reiserfs_insert_item and clones */ - unp_t unfm_ptr; /* Handle on an unformatted node - that will be inserted in the - tree. */ + + /* Key to search for the last byte of the converted item. */ + struct cpu_key end_key; + + /* + * new indirect item to be inserted or key + * of unfm pointer to be pasted + */ + struct item_head ind_ih; + int blk_size; + /* returned value for reiserfs_insert_item and clones */ + int retval; + /* Handle on an unformatted node that will be inserted in the tree. */ + unp_t unfm_ptr; BUG_ON(!th->t_trans_id); @@ -37,8 +50,10 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, blk_size = sb->s_blocksize; - /* and key to search for append or insert pointer to the new - unformatted node. */ + /* + * and key to search for append or insert pointer to the new + * unformatted node. + */ copy_item_head(&ind_ih, p_le_ih); set_le_ih_k_offset(&ind_ih, tail_offset); set_le_ih_k_type(&ind_ih, TYPE_INDIRECT); @@ -55,7 +70,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, return -EIO; } - p_le_ih = PATH_PITEM_HEAD(path); + p_le_ih = tp_item_head(path); unfm_ptr = cpu_to_le32(unbh->b_blocknr); @@ -76,36 +91,43 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, if (retval) { return retval; } - // note: from here there are two keys which have matching first - // three key components. They only differ by the fourth one. + /* + * note: from here there are two keys which have matching first + * three key components. They only differ by the fourth one. + */ /* Set the key to search for the direct items of the file */ make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, 4); - /* Move bytes from the direct items to the new unformatted node - and delete them. */ + /* + * Move bytes from the direct items to the new unformatted node + * and delete them. + */ while (1) { int tail_size; - /* end_key.k_offset is set so, that we will always have found - last item of the file */ + /* + * end_key.k_offset is set so, that we will always have found + * last item of the file + */ if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) reiserfs_panic(sb, "PAP-14050", "direct item (%K) not found", &end_key); - p_le_ih = PATH_PITEM_HEAD(path); + p_le_ih = tp_item_head(path); RFALSE(!is_direct_le_ih(p_le_ih), "vs-14055: direct item expected(%K), found %h", &end_key, p_le_ih); tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1)) + ih_item_len(p_le_ih) - 1; - /* we only send the unbh pointer if the buffer is not up to date. - ** this avoids overwriting good data from writepage() with old data - ** from the disk or buffer cache - ** Special case: unbh->b_page will be NULL if we are coming through - ** DIRECT_IO handler here. + /* + * we only send the unbh pointer if the buffer is not + * up to date. this avoids overwriting good data from + * writepage() with old data from the disk or buffer cache + * Special case: unbh->b_page will be NULL if we are coming + * through DIRECT_IO handler here. */ if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { @@ -117,13 +139,15 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, up_to_date_bh); total_tail += retval; + + /* done: file does not have direct items anymore */ if (tail_size == retval) - // done: file does not have direct items anymore break; } - /* if we've copied bytes from disk into the page, we need to zero - ** out the unused part of the block (it was not up to date before) + /* + * if we've copied bytes from disk into the page, we need to zero + * out the unused part of the block (it was not up to date before) */ if (up_to_date_bh) { unsigned pgoff = @@ -146,9 +170,11 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) BUG(); } clear_buffer_dirty(bh); - /* Remove the buffer from whatever list it belongs to. We are mostly - interested in removing it from per-sb j_dirty_buffers list, to avoid - BUG() on attempt to write not mapped buffer */ + /* + * Remove the buffer from whatever list it belongs to. We are mostly + * interested in removing it from per-sb j_dirty_buffers list, to avoid + * BUG() on attempt to write not mapped buffer + */ if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { struct inode *inode = bh->b_page->mapping->host; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); @@ -164,12 +190,14 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) unlock_buffer(bh); } -/* this first locks inode (neither reads nor sync are permitted), - reads tail through page cache, insert direct item. When direct item - inserted successfully inode is left locked. Return value is always - what we expect from it (number of cut bytes). But when tail remains - in the unformatted node, we set mode to SKIP_BALANCING and unlock - inode */ +/* + * this first locks inode (neither reads nor sync are permitted), + * reads tail through page cache, insert direct item. When direct item + * inserted successfully inode is left locked. Return value is always + * what we expect from it (number of cut bytes). But when tail remains + * in the unformatted node, we set mode to SKIP_BALANCING and unlock + * inode + */ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *inode, struct page *page, struct treepath *path, /* path to the indirect item. */ @@ -194,7 +222,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, *mode = M_SKIP_BALANCING; /* store item head path points to. */ - copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + copy_item_head(&s_ih, tp_item_head(path)); tail_len = (n_new_file_size & (block_size - 1)); if (get_inode_sd_version(inode) == STAT_DATA_V2) @@ -207,9 +235,11 @@ int indirect2direct(struct reiserfs_transaction_handle *th, 1) * sb->s_blocksize; pos1 = pos; - // we are protected by i_mutex. The tail can not disapper, not - // append can be done either - // we are in truncate or packing tail in file_release + /* + * we are protected by i_mutex. The tail can not disapper, not + * append can be done either + * we are in truncate or packing tail in file_release + */ tail = (char *)kmap(page); /* this can schedule */ @@ -220,7 +250,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, reiserfs_panic(sb, "PAP-5520", "item to be converted %K does not exist", item_key); - copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + copy_item_head(&s_ih, tp_item_head(path)); #ifdef CONFIG_REISERFS_CHECK pos = le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - @@ -236,9 +266,10 @@ int indirect2direct(struct reiserfs_transaction_handle *th, pos1 + 1, TYPE_DIRECT, round_tail_len, 0xffff /*ih_free_space */ ); - /* we want a pointer to the first byte of the tail in the page. - ** the page was locked and this part of the page was up to date when - ** indirect2direct was called, so we know the bytes are still valid + /* + * we want a pointer to the first byte of the tail in the page. + * the page was locked and this part of the page was up to date when + * indirect2direct was called, so we know the bytes are still valid */ tail = tail + (pos & (PAGE_CACHE_SIZE - 1)); @@ -250,12 +281,14 @@ int indirect2direct(struct reiserfs_transaction_handle *th, /* Insert tail as new direct item in the tree */ if (reiserfs_insert_item(th, path, &key, &s_ih, inode, tail ? tail : NULL) < 0) { - /* No disk memory. So we can not convert last unformatted node - to the direct item. In this case we used to adjust - indirect items's ih_free_space. Now ih_free_space is not - used, it would be ideal to write zeros to corresponding - unformatted node. For now i_size is considered as guard for - going out of file size */ + /* + * No disk memory. So we can not convert last unformatted node + * to the direct item. In this case we used to adjust + * indirect items's ih_free_space. Now ih_free_space is not + * used, it would be ideal to write zeros to corresponding + * unformatted node. For now i_size is considered as guard for + * going out of file size + */ kunmap(page); return block_size - round_tail_len; } @@ -264,12 +297,16 @@ int indirect2direct(struct reiserfs_transaction_handle *th, /* make sure to get the i_blocks changes from reiserfs_insert_item */ reiserfs_update_sd(th, inode); - // note: we have now the same as in above direct2indirect - // conversion: there are two keys which have matching first three - // key components. They only differ by the fouhth one. + /* + * note: we have now the same as in above direct2indirect + * conversion: there are two keys which have matching first three + * key components. They only differ by the fourth one. + */ - /* We have inserted new direct item and must remove last - unformatted node. */ + /* + * We have inserted new direct item and must remove last + * unformatted node. + */ *mode = M_CUT; /* we store position of first direct item in the in-core inode */ diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8a9e2dcfe00..ca416d099e7 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -50,14 +50,17 @@ #include <linux/stat.h> #include <linux/quotaops.h> #include <linux/security.h> +#include <linux/posix_acl_xattr.h> #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" -/* Helpers for inode ops. We do this so that we don't have all the VFS +/* + * Helpers for inode ops. We do this so that we don't have all the VFS * overhead and also for proper i_mutex annotation. - * dir->i_mutex must be held for all of them. */ + * dir->i_mutex must be held for all of them. + */ #ifdef CONFIG_REISERFS_FS_XATTR static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { @@ -72,10 +75,12 @@ static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return dir->i_op->mkdir(dir, dentry, mode); } -/* We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr +/* + * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr * mutation ops aren't called during rename or splace, which are the * only other users of I_MUTEX_CHILD. It violates the ordering, but that's - * better than allocating another subclass just for this code. */ + * better than allocating another subclass just for this code. + */ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; @@ -165,9 +170,11 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) return xadir; } -/* The following are side effects of other operations that aren't explicitly +/* + * The following are side effects of other operations that aren't explicitly * modifying extended attributes. This includes operations such as permissions - * or ownership changes, object deletions, etc. */ + * or ownership changes, object deletions, etc. + */ struct reiserfs_dentry_buf { struct dir_context ctx; struct dentry *xadir; @@ -266,11 +273,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, cleanup_dentry_buf(&buf); if (!err) { - /* We start a transaction here to avoid a ABBA situation + /* + * We start a transaction here to avoid a ABBA situation * between the xattr root's i_mutex and the journal lock. * This doesn't incur much additional overhead since the * new transaction will just nest inside the - * outer transaction. */ + * outer transaction. + */ int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; @@ -283,7 +292,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, I_MUTEX_XATTR); err = action(dir, data); reiserfs_write_lock(inode->i_sb); - jerror = journal_end(&th, inode->i_sb, blocks); + jerror = journal_end(&th); reiserfs_write_unlock(inode->i_sb); mutex_unlock(&dir->d_parent->d_inode->i_mutex); err = jerror ?: err; @@ -348,9 +357,11 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) } #ifdef CONFIG_REISERFS_FS_XATTR -/* Returns a dentry corresponding to a specific extended attribute file +/* + * Returns a dentry corresponding to a specific extended attribute file * for the inode. If flags allow, the file is created. Otherwise, a - * valid or negative dentry, or an error is returned. */ + * valid or negative dentry, or an error is returned. + */ static struct dentry *xattr_lookup(struct inode *inode, const char *name, int flags) { @@ -399,8 +410,10 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n) { struct address_space *mapping = dir->i_mapping; struct page *page; - /* We can deadlock if we try to free dentries, - and an unlink/rmdir has just occurred - GFP_NOFS avoids this */ + /* + * We can deadlock if we try to free dentries, + * and an unlink/rmdir has just occurred - GFP_NOFS avoids this + */ mapping_set_gfp_mask(mapping, GFP_NOFS); page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL); if (!IS_ERR(page)) { @@ -410,7 +423,7 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n) } return page; - fail: +fail: reiserfs_put_page(page); return ERR_PTR(-EIO); } @@ -588,7 +601,7 @@ int reiserfs_xattr_set(struct inode *inode, const char *name, buffer, buffer_size, flags); reiserfs_write_lock(inode->i_sb); - error2 = journal_end(&th, inode->i_sb, jbegin_count); + error2 = journal_end(&th); reiserfs_write_unlock(inode->i_sb); if (error == 0) error = error2; @@ -614,8 +627,10 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, if (name == NULL) return -EINVAL; - /* We can't have xattrs attached to v1 items since they don't have - * generation numbers */ + /* + * We can't have xattrs attached to v1 items since they don't have + * generation numbers + */ if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; @@ -904,20 +919,24 @@ static const struct xattr_handler *reiserfs_xattr_handlers[] = { &reiserfs_xattr_security_handler, #endif #ifdef CONFIG_REISERFS_FS_POSIX_ACL - &reiserfs_posix_acl_access_handler, - &reiserfs_posix_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; static int xattr_mount_check(struct super_block *s) { - /* We need generation numbers to ensure that the oid mapping is correct - * v3.5 filesystems don't have them. */ + /* + * We need generation numbers to ensure that the oid mapping is correct + * v3.5 filesystems don't have them. + */ if (old_format_only(s)) { if (reiserfs_xattrs_optional(s)) { - /* Old format filesystem, but optional xattrs have - * been enabled. Error out. */ + /* + * Old format filesystem, but optional xattrs have + * been enabled. Error out. + */ reiserfs_warning(s, "jdm-2005", "xattrs/ACLs not supported " "on pre-v3.6 format filesystems. " @@ -971,9 +990,11 @@ int reiserfs_lookup_privroot(struct super_block *s) return err; } -/* We need to take a copy of the mount flags since things like +/* + * We need to take a copy of the mount flags since things like * MS_RDONLY don't get set until *after* we're called. - * mount_flags != mount_options */ + * mount_flags != mount_options + */ int reiserfs_xattr_init(struct super_block *s, int mount_flags) { int err = 0; @@ -1006,8 +1027,8 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) error: if (err) { - clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); - clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt); + clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt); } /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index f59626c5d33..857ec7e3016 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -61,7 +61,8 @@ static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size) return ret; } -/* We may have to create up to 3 objects: xattr root, xattr dir, xattr file. +/* + * We may have to create up to 3 objects: xattr root, xattr dir, xattr file. * Let's try to be smart about it. * xattr root: We cache it. If it's not cached, we may need to create it. * xattr dir: If anything has been loaded for this inode, we can set a flag diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 06c04f73da6..44503e29379 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -11,38 +11,24 @@ #include "acl.h" #include <asm/uaccess.h> -static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, +static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, struct posix_acl *acl); -static int -posix_acl_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) + +int +reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; int error, error2; struct reiserfs_transaction_handle th; size_t jcreate_blocks; - if (!reiserfs_posixacl(inode->i_sb)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) { - return PTR_ERR(acl); - } else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; + int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; + - /* Pessimism: We can't assume that anything from the xattr root up - * has been created. */ + /* + * Pessimism: We can't assume that anything from the xattr root up + * has been created. + */ jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) + reiserfs_xattr_nblocks(inode, size) * 2; @@ -51,44 +37,21 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value, error = journal_begin(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); if (error == 0) { - error = reiserfs_set_acl(&th, inode, type, acl); + error = __reiserfs_set_acl(&th, inode, type, acl); reiserfs_write_lock(inode->i_sb); - error2 = journal_end(&th, inode->i_sb, jcreate_blocks); + error2 = journal_end(&th); reiserfs_write_unlock(inode->i_sb); if (error2) error = error2; } - release_and_out: - posix_acl_release(acl); - return error; -} - -static int -posix_acl_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (!reiserfs_posixacl(dentry->d_sb)) - return -EOPNOTSUPP; - - acl = reiserfs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - return error; } /* * Convert from filesystem to in-memory representation. */ -static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) +static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size) { const char *end = (char *)value + size; int n, count; @@ -150,7 +113,7 @@ static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) goto fail; return acl; - fail: +fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } @@ -158,7 +121,7 @@ static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) /* * Convert from in-memory to filesystem representation. */ -static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) +static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size) { reiserfs_acl_header *ext_acl; char *e; @@ -203,7 +166,7 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) } return (char *)ext_acl; - fail: +fail: kfree(ext_acl); return ERR_PTR(-EINVAL); } @@ -221,10 +184,6 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) int size; int retval; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -251,13 +210,15 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) retval = reiserfs_xattr_get(inode, name, value, size); if (retval == -ENODATA || retval == -ENOSYS) { - /* This shouldn't actually happen as it should have - been caught above.. but just in case */ + /* + * This shouldn't actually happen as it should have + * been caught above.. but just in case + */ acl = NULL; } else if (retval < 0) { acl = ERR_PTR(retval); } else { - acl = posix_acl_from_disk(value, retval); + acl = reiserfs_posix_acl_from_disk(value, retval); } if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); @@ -273,7 +234,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) * BKL held [before 2.5.x] */ static int -reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, +__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, struct posix_acl *acl) { char *name; @@ -281,9 +242,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -307,7 +265,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, } if (acl) { - value = posix_acl_to_disk(acl, &size); + value = reiserfs_posix_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); } @@ -336,64 +294,61 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, return error; } -/* dir->i_mutex: locked, - * inode is new and not released into the wild yet */ +/* + * dir->i_mutex: locked, + * inode is new and not released into the wild yet + */ int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int err = 0; /* ACLs only get applied to files and directories */ if (S_ISLNK(inode->i_mode)) return 0; - /* ACLs can only be used on "new" objects, so if it's an old object - * there is nothing to inherit from */ + /* + * ACLs can only be used on "new" objects, so if it's an old object + * there is nothing to inherit from + */ if (get_inode_sd_version(dir) == STAT_DATA_V1) goto apply_umask; - /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This + /* + * Don't apply ACLs to objects in the .reiserfs_priv tree.. This * would be useless since permissions are ignored, and a pain because - * it introduces locking cycles */ + * it introduces locking cycles + */ if (IS_PRIVATE(dir)) { inode->i_flags |= S_PRIVATE; goto apply_umask; } - acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; + if (default_acl) { + err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } if (acl) { - /* Copy the default ACL to the default ACL of a new directory */ - if (S_ISDIR(inode->i_mode)) { - err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, - acl); - if (err) - goto cleanup; - } - - /* Now we reconcile the new ACL and the mode, - potentially modifying both */ - err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (err < 0) - return err; - - /* If we need an ACL.. */ - if (err > 0) - err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); - cleanup: + if (!err) + err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, + acl); posix_acl_release(acl); - } else { - apply_umask: - /* no ACL, apply umask */ - inode->i_mode &= ~current_umask(); } return err; + +apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current_umask(); + return err; } /* This is used to cache the default acl before a new object is created. @@ -442,84 +397,11 @@ int reiserfs_cache_default_acl(struct inode *inode) */ int reiserfs_acl_chmod(struct inode *inode) { - struct reiserfs_transaction_handle th; - struct posix_acl *acl; - size_t size; - int error; - if (IS_PRIVATE(inode)) return 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (get_inode_sd_version(inode) == STAT_DATA_V1 || - !reiserfs_posixacl(inode->i_sb)) { - return 0; - } - - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - if (!acl) + !reiserfs_posixacl(inode->i_sb)) return 0; - if (IS_ERR(acl)) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode); - if (error) - return error; - size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count)); - reiserfs_write_lock(inode->i_sb); - error = journal_begin(&th, inode->i_sb, size * 2); - reiserfs_write_unlock(inode->i_sb); - if (!error) { - int error2; - error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl); - reiserfs_write_lock(inode->i_sb); - error2 = journal_end(&th, inode->i_sb, size * 2); - reiserfs_write_unlock(inode->i_sb); - if (error2) - error = error2; - } - posix_acl_release(acl); - return error; -} - -static size_t posix_acl_access_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!reiserfs_posixacl(dentry->d_sb)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -const struct xattr_handler reiserfs_posix_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = posix_acl_get, - .set = posix_acl_set, - .list = posix_acl_access_list, -}; - -static size_t posix_acl_default_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!reiserfs_posixacl(dentry->d_sb)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; + return posix_acl_chmod(inode, inode->i_mode); } - -const struct xattr_handler reiserfs_posix_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = posix_acl_get, - .set = posix_acl_set, - .list = posix_acl_default_list, -}; diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index f373bde8f54..ea06c755486 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -72,8 +72,8 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma) const struct file_operations romfs_ro_fops = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .mmap = romfs_mmap, .get_unmapped_area = romfs_get_unmapped_area, diff --git a/fs/romfs/super.c b/fs/romfs/super.c index ff1d3d42e72..ef90e8bca95 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -432,6 +432,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ static int romfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } @@ -533,16 +534,14 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) root = romfs_iget(sb, pos); if (IS_ERR(root)) - goto error; + return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) - goto error; + return -ENOMEM; return 0; -error: - return -EINVAL; error_rsb_inval: ret = -EINVAL; error_rsb: diff --git a/fs/select.c b/fs/select.c index 35d4adc749d..467bb1cb3ea 100644 --- a/fs/select.c +++ b/fs/select.c @@ -238,8 +238,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state, set_current_state(state); if (!pwq->triggered) - rc = freezable_schedule_hrtimeout_range(expires, slack, - HRTIMER_MODE_ABS); + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* @@ -455,7 +454,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) const struct file_operations *f_op; f_op = f.file->f_op; mask = DEFAULT_POLLMASK; - if (f_op && f_op->poll) { + if (f_op->poll) { wait_key_set(wait, in, out, bit, busy_flag); mask = (*f_op->poll)(f.file, wait); @@ -762,7 +761,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, mask = POLLNVAL; if (f.file) { mask = DEFAULT_POLLMASK; - if (f.file->f_op && f.file->f_op->poll) { + if (f.file->f_op->poll) { pwait->_key = pollfd->events|POLLERR|POLLHUP; pwait->_key |= busy_flag; mask = f.file->f_op->poll(f.file, pwait); diff --git a/fs/seq_file.c b/fs/seq_file.c index 3135c2525c7..3857b720cb1 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -8,8 +8,10 @@ #include <linux/fs.h> #include <linux/export.h> #include <linux/seq_file.h> +#include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/cred.h> +#include <linux/mm.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -30,6 +32,16 @@ static void seq_set_overflow(struct seq_file *m) m->count = m->size; } +static void *seq_buf_alloc(unsigned long size) +{ + void *buf; + + buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!buf && size > PAGE_SIZE) + buf = vmalloc(size); + return buf; +} + /** * seq_open - initialize sequential file * @file: file we initialize @@ -96,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset) return 0; } if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } @@ -135,8 +147,9 @@ static int traverse(struct seq_file *m, loff_t offset) Eoverflow: m->op->stop(m, p); - kfree(m->buf); - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + kvfree(m->buf); + m->count = 0; + m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? -ENOMEM : -EAGAIN; } @@ -191,7 +204,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } @@ -231,11 +244,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (m->count < m->size) goto Fill; m->op->stop(m, p); - kfree(m->buf); - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + kvfree(m->buf); + m->count = 0; + m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; - m->count = 0; m->version = 0; pos = m->index; p = m->op->start(m, &pos); @@ -328,6 +341,8 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) m->read_pos = offset; retval = file->f_pos = offset; } + } else { + file->f_pos = offset; } } file->f_version = m->version; @@ -347,7 +362,7 @@ EXPORT_SYMBOL(seq_lseek); int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - kfree(m->buf); + kvfree(m->buf); kfree(m); return 0; } @@ -602,13 +617,13 @@ EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { - char *buf = kmalloc(size, GFP_KERNEL); + char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { - kfree(buf); + kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; @@ -764,6 +779,21 @@ int seq_write(struct seq_file *seq, const void *data, size_t len) } EXPORT_SYMBOL(seq_write); +/** + * seq_pad - write padding spaces to buffer + * @m: seq_file identifying the buffer to which data should be written + * @c: the byte to append after padding if non-zero + */ +void seq_pad(struct seq_file *m, char c) +{ + int size = m->pad_until - m->count; + if (size > 0) + seq_printf(m, "%*s", size, ""); + if (c) + seq_putc(m, c); +} +EXPORT_SYMBOL(seq_pad); + struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; diff --git a/fs/splice.c b/fs/splice.c index 3b7ee656f3a..f5cb9ba8451 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -32,6 +32,7 @@ #include <linux/gfp.h> #include <linux/socket.h> #include <linux/compat.h> +#include <linux/aio.h> #include "internal.h" /* @@ -136,8 +137,6 @@ error: const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, @@ -156,8 +155,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, @@ -547,14 +544,28 @@ EXPORT_SYMBOL(generic_file_splice_read); static const struct pipe_buf_operations default_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, }; +static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +/* Pipe buffer operations for a socket and similar. */ +const struct pipe_buf_operations nosteal_pipe_buf_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_nosteal, + .get = generic_pipe_buf_get, +}; +EXPORT_SYMBOL(nosteal_pipe_buf_ops); + static ssize_t kernel_readv(struct file *file, const struct iovec *vec, unsigned long vlen, loff_t offset) { @@ -695,7 +706,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, loff_t pos = sd->pos; int more; - if (!likely(file->f_op && file->f_op->sendpage)) + if (!likely(file->f_op->sendpage)) return -EINVAL; more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; @@ -707,63 +718,6 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, sd->len, &pos, more); } -/* - * This is a little more tricky than the file -> pipe splicing. There are - * basically three cases: - * - * - Destination page already exists in the address space and there - * are users of it. For that case we have no other option that - * copying the data. Tough luck. - * - Destination page already exists in the address space, but there - * are no users of it. Make sure it's uptodate, then drop it. Fall - * through to last case. - * - Destination page does not exist, we can add the pipe page to - * the page cache and avoid the copy. - * - * If asked to move pages to the output file (SPLICE_F_MOVE is set in - * sd->flags), we attempt to migrate pages from the pipe to the output - * file address space page cache. This is possible if no one else has - * the pipe page referenced outside of the pipe and page cache. If - * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create - * a new page in the output file page cache and fill/dirty that. - */ -int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) -{ - struct file *file = sd->u.file; - struct address_space *mapping = file->f_mapping; - unsigned int offset, this_len; - struct page *page; - void *fsdata; - int ret; - - offset = sd->pos & ~PAGE_CACHE_MASK; - - this_len = sd->len; - if (this_len + offset > PAGE_CACHE_SIZE) - this_len = PAGE_CACHE_SIZE - offset; - - ret = pagecache_write_begin(file, mapping, sd->pos, this_len, - AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); - if (unlikely(ret)) - goto out; - - if (buf->page != page) { - char *src = buf->ops->map(pipe, buf, 1); - char *dst = kmap_atomic(page); - - memcpy(dst + offset, src + buf->offset, this_len); - flush_dcache_page(page); - kunmap_atomic(dst); - buf->ops->unmap(pipe, buf, src); - } - ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, - page, fsdata); -out: - return ret; -} -EXPORT_SYMBOL(pipe_to_file); - static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { smp_mb(); @@ -792,7 +746,7 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe) * locking is required around copying the pipe buffers to the * destination. */ -int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, +static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { int ret; @@ -839,7 +793,6 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, return 1; } -EXPORT_SYMBOL(splice_from_pipe_feed); /** * splice_from_pipe_next - wait for some data to splice from @@ -851,7 +804,7 @@ EXPORT_SYMBOL(splice_from_pipe_feed); * value (one) if pipe buffers are available. It will return zero * or -errno if no more data needs to be spliced. */ -int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) +static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) { while (!pipe->nrbufs) { if (!pipe->writers) @@ -876,7 +829,6 @@ int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) return 1; } -EXPORT_SYMBOL(splice_from_pipe_next); /** * splice_from_pipe_begin - start splicing from pipe @@ -887,12 +839,11 @@ EXPORT_SYMBOL(splice_from_pipe_next); * splice_from_pipe_next() and splice_from_pipe_feed() to * initialize the necessary fields of @sd. */ -void splice_from_pipe_begin(struct splice_desc *sd) +static void splice_from_pipe_begin(struct splice_desc *sd) { sd->num_spliced = 0; sd->need_wakeup = false; } -EXPORT_SYMBOL(splice_from_pipe_begin); /** * splice_from_pipe_end - finish splicing from pipe @@ -904,12 +855,11 @@ EXPORT_SYMBOL(splice_from_pipe_begin); * be called after a loop containing splice_from_pipe_next() and * splice_from_pipe_feed(). */ -void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) +static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) { if (sd->need_wakeup) wakeup_pipe_writers(pipe); } -EXPORT_SYMBOL(splice_from_pipe_end); /** * __splice_from_pipe - splice data from a pipe to given actor @@ -975,7 +925,7 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, } /** - * generic_file_splice_write - splice data from a pipe to a file + * iter_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to * @ppos: position in @out @@ -985,40 +935,122 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, * Description: * Will either move or copy pages (determined by @flags options) from * the given pipe inode to the given file. + * This one is ->write_iter-based. * */ ssize_t -generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, +iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; + int nbufs = pipe->buffers; + struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), + GFP_KERNEL); ssize_t ret; + if (unlikely(!array)) + return -ENOMEM; + pipe_lock(pipe); splice_from_pipe_begin(&sd); - do { + while (sd.total_len) { + struct iov_iter from; + struct kiocb kiocb; + size_t left; + int n, idx; + ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - ret = file_remove_suid(out); - if (!ret) { - ret = file_update_time(out); - if (!ret) - ret = splice_from_pipe_feed(pipe, &sd, - pipe_to_file); + if (unlikely(nbufs < pipe->buffers)) { + kfree(array); + nbufs = pipe->buffers; + array = kcalloc(nbufs, sizeof(struct bio_vec), + GFP_KERNEL); + if (!array) { + ret = -ENOMEM; + break; + } } - mutex_unlock(&inode->i_mutex); - } while (ret > 0); + + /* build the vector */ + left = sd.total_len; + for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) { + struct pipe_buffer *buf = pipe->bufs + idx; + size_t this_len = buf->len; + + if (this_len > left) + this_len = left; + + if (idx == pipe->buffers - 1) + idx = -1; + + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { + if (ret == -ENODATA) + ret = 0; + goto done; + } + + array[n].bv_page = buf->page; + array[n].bv_len = this_len; + array[n].bv_offset = buf->offset; + left -= this_len; + } + + /* ... iov_iter */ + from.type = ITER_BVEC | WRITE; + from.bvec = array; + from.nr_segs = n; + from.count = sd.total_len - left; + from.iov_offset = 0; + + /* ... and iocb */ + init_sync_kiocb(&kiocb, out); + kiocb.ki_pos = sd.pos; + kiocb.ki_nbytes = sd.total_len - left; + + /* now, send it */ + ret = out->f_op->write_iter(&kiocb, &from); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + + if (ret <= 0) + break; + + sd.num_spliced += ret; + sd.total_len -= ret; + *ppos = sd.pos = kiocb.ki_pos; + + /* dismiss the fully eaten buffers, adjust the partial one */ + while (ret) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + if (ret >= buf->len) { + const struct pipe_buf_operations *ops = buf->ops; + ret -= buf->len; + buf->len = 0; + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + if (pipe->files) + sd.need_wakeup = true; + } else { + buf->offset += ret; + buf->len -= ret; + ret = 0; + } + } + } +done: + kfree(array); splice_from_pipe_end(pipe, &sd); pipe_unlock(pipe); @@ -1026,21 +1058,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, if (sd.num_spliced) ret = sd.num_spliced; - if (ret > 0) { - int err; - - err = generic_write_sync(out, *ppos, ret); - if (err) - ret = err; - else - *ppos += ret; - balance_dirty_pages_ratelimited(mapping); - } - return ret; } -EXPORT_SYMBOL(generic_file_splice_write); +EXPORT_SYMBOL(iter_file_splice_write); static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) @@ -1049,9 +1070,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, void *data; loff_t tmp = sd->pos; - data = buf->ops->map(pipe, buf, 0); + data = kmap(buf->page); ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); - buf->ops->unmap(pipe, buf, data); + kunmap(buf->page); return ret; } @@ -1099,7 +1120,7 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); - if (out->f_op && out->f_op->splice_write) + if (out->f_op->splice_write) splice_write = out->f_op->splice_write; else splice_write = default_file_splice_write; @@ -1125,7 +1146,7 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; - if (in->f_op && in->f_op->splice_read) + if (in->f_op->splice_read) splice_read = in->f_op->splice_read; else splice_read = default_file_splice_read; @@ -1510,116 +1531,50 @@ static int get_iovec_page_array(const struct iovec __user *iov, static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - char *src; - int ret; - - /* - * See if we can use the atomic maps, by prefaulting in the - * pages and doing an atomic copy - */ - if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { - src = buf->ops->map(pipe, buf, 1); - ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, - sd->len); - buf->ops->unmap(pipe, buf, src); - if (!ret) { - ret = sd->len; - goto out; - } - } - - /* - * No dice, use slow non-atomic map and copy - */ - src = buf->ops->map(pipe, buf, 0); - - ret = sd->len; - if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) - ret = -EFAULT; - - buf->ops->unmap(pipe, buf, src); -out: - if (ret > 0) - sd->u.userptr += ret; - return ret; + int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); + return n == sd->len ? n : -EFAULT; } /* * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ -static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, +static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; struct splice_desc sd; - ssize_t size; - int error; long ret; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t count; pipe = get_pipe_info(file); if (!pipe) return -EBADF; - pipe_lock(pipe); - - error = ret = 0; - while (nr_segs) { - void __user *base; - size_t len; - - /* - * Get user address base and length for this iovec. - */ - error = get_user(base, &iov->iov_base); - if (unlikely(error)) - break; - error = get_user(len, &iov->iov_len); - if (unlikely(error)) - break; - - /* - * Sanity check this iovec. 0 read succeeds. - */ - if (unlikely(!len)) - break; - if (unlikely(!base)) { - error = -EFAULT; - break; - } - - if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { - error = -EFAULT; - break; - } - - sd.len = 0; - sd.total_len = len; - sd.flags = flags; - sd.u.userptr = base; - sd.pos = 0; - - size = __splice_from_pipe(pipe, &sd, pipe_to_user); - if (size < 0) { - if (!ret) - ret = size; - - break; - } - - ret += size; + ret = rw_copy_check_uvector(READ, uiov, nr_segs, + ARRAY_SIZE(iovstack), iovstack, &iov); + if (ret <= 0) + goto out; - if (size < len) - break; + count = ret; + iov_iter_init(&iter, READ, iov, nr_segs, count); - nr_segs--; - iov++; - } + sd.len = 0; + sd.total_len = count; + sd.flags = flags; + sd.u.data = &iter; + sd.pos = 0; + pipe_lock(pipe); + ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); - if (!ret) - ret = error; +out: + if (iov != iovstack) + kfree(iov); return ret; } diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index c70111ebefd..b6fa8657dcb 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -25,6 +25,78 @@ config SQUASHFS If unsure, say N. +choice + prompt "File decompression options" + depends on SQUASHFS + help + Squashfs now supports two options for decompressing file + data. Traditionally Squashfs has decompressed into an + intermediate buffer and then memcopied it into the page cache. + Squashfs now supports the ability to decompress directly into + the page cache. + + If unsure, select "Decompress file data into an intermediate buffer" + +config SQUASHFS_FILE_CACHE + bool "Decompress file data into an intermediate buffer" + help + Decompress file data into an intermediate buffer and then + memcopy it into the page cache. + +config SQUASHFS_FILE_DIRECT + bool "Decompress files directly into the page cache" + help + Directly decompress file data into the page cache. + Doing so can significantly improve performance because + it eliminates a memcpy and it also removes the lock contention + on the single buffer. + +endchoice + +choice + prompt "Decompressor parallelisation options" + depends on SQUASHFS + help + Squashfs now supports three parallelisation options for + decompression. Each one exhibits various trade-offs between + decompression performance and CPU and memory usage. + + If in doubt, select "Single threaded compression" + +config SQUASHFS_DECOMP_SINGLE + bool "Single threaded compression" + help + Traditionally Squashfs has used single-threaded decompression. + Only one block (data or metadata) can be decompressed at any + one time. This limits CPU and memory usage to a minimum. + +config SQUASHFS_DECOMP_MULTI + bool "Use multiple decompressors for parallel I/O" + help + By default Squashfs uses a single decompressor but it gives + poor performance on parallel I/O workloads when using multiple CPU + machines due to waiting on decompressor availability. + + If you have a parallel I/O workload and your system has enough memory, + using this option may improve overall I/O performance. + + This decompressor implementation uses up to two parallel + decompressors per core. It dynamically allocates decompressors + on a demand basis. + +config SQUASHFS_DECOMP_MULTI_PERCPU + bool "Use percpu multiple decompressors for parallel I/O" + help + By default Squashfs uses a single decompressor but it gives + poor performance on parallel I/O workloads when using multiple CPU + machines due to waiting on decompressor availability. + + This decompressor implementation uses a maximum of one + decompressor per core. It uses percpu variables to ensure + decompression is load-balanced across the cores. + +endchoice + config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 110b0476f3b..4132520b4ff 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,6 +5,11 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o +squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o +squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o +squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o +squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 41d108ecc9b..0cea9b9236d 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -36,6 +36,7 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" /* * Read the metadata block length, this is stored in the first two @@ -86,16 +87,16 @@ static struct buffer_head *get_block_length(struct super_block *sb, * generated a larger block - this does occasionally happen with compression * algorithms). */ -int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, - int length, u64 *next_index, int srclength, int pages) +int squashfs_read_data(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output) { struct squashfs_sb_info *msblk = sb->s_fs_info; struct buffer_head **bh; int offset = index & ((1 << msblk->devblksize_log2) - 1); u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, page = 0, avail; + int bytes, compressed, b = 0, k = 0, avail, i; - bh = kcalloc(((srclength + msblk->devblksize - 1) + bh = kcalloc(((output->length + msblk->devblksize - 1) >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); if (bh == NULL) return -ENOMEM; @@ -111,9 +112,9 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, *next_index = index + length; TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", - index, compressed ? "" : "un", length, srclength); + index, compressed ? "" : "un", length, output->length); - if (length < 0 || length > srclength || + if (length < 0 || length > output->length || (index + length) > msblk->bytes_used) goto read_failure; @@ -145,7 +146,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, TRACE("Block @ 0x%llx, %scompressed size %d\n", index, compressed ? "" : "un", length); - if (length < 0 || length > srclength || + if (length < 0 || length > output->length || (index + length) > msblk->bytes_used) goto block_release; @@ -158,9 +159,15 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, ll_rw_block(READ, b - 1, bh + 1); } + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + } + if (compressed) { - length = squashfs_decompress(msblk, buffer, bh, b, offset, - length, srclength, pages); + length = squashfs_decompress(msblk, bh, b, offset, length, + output); if (length < 0) goto read_failure; } else { @@ -168,22 +175,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, * Block is uncompressed. */ int in, pg_offset = 0; + void *data = squashfs_first_page(output); for (bytes = length; k < b; k++) { in = min(bytes, msblk->devblksize - offset); bytes -= in; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto block_release; while (in) { if (pg_offset == PAGE_CACHE_SIZE) { - page++; + data = squashfs_next_page(output); pg_offset = 0; } avail = min_t(int, in, PAGE_CACHE_SIZE - pg_offset); - memcpy(buffer[page] + pg_offset, - bh[k]->b_data + offset, avail); + memcpy(data + pg_offset, bh[k]->b_data + offset, + avail); in -= avail; pg_offset += avail; offset += avail; @@ -191,6 +196,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, offset = 0; put_bh(bh[k]); } + squashfs_finish_page(output); } kfree(bh); diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index af0b7380259..1cb70a0b216 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -56,6 +56,7 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" +#include "page_actor.h" /* * Look-up block in cache, and increment usage count. If not in cache, read @@ -119,9 +120,8 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, entry->error = 0; spin_unlock(&cache->lock); - entry->length = squashfs_read_data(sb, entry->data, - block, length, &entry->next_index, - cache->block_size, cache->pages); + entry->length = squashfs_read_data(sb, block, length, + &entry->next_index, entry->actor); spin_lock(&cache->lock); @@ -220,6 +220,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache) kfree(cache->entry[i].data[j]); kfree(cache->entry[i].data); } + kfree(cache->entry[i].actor); } kfree(cache->entry); @@ -280,6 +281,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, goto cleanup; } } + + entry->actor = squashfs_page_actor_init(entry->data, + cache->pages, 0); + if (entry->actor == NULL) { + ERROR("Failed to allocate %s cache entry\n", name); + goto cleanup; + } } return cache; @@ -410,6 +418,7 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length) int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; int i, res; void *table, *buffer, **data; + struct squashfs_page_actor *actor; table = buffer = kmalloc(length, GFP_KERNEL); if (table == NULL) @@ -421,19 +430,28 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length) goto failed; } + actor = squashfs_page_actor_init(data, pages, length); + if (actor == NULL) { + res = -ENOMEM; + goto failed2; + } + for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) data[i] = buffer; - res = squashfs_read_data(sb, data, block, length | - SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); + res = squashfs_read_data(sb, block, length | + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor); kfree(data); + kfree(actor); if (res < 0) goto failed; return table; +failed2: + kfree(data); failed: kfree(table); return ERR_PTR(res); diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 3f6271d86ab..ac22fe73b0a 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -30,6 +30,7 @@ #include "squashfs_fs_sb.h" #include "decompressor.h" #include "squashfs.h" +#include "page_actor.h" /* * This file (and decompressor.h) implements a decompressor framework for @@ -37,29 +38,29 @@ */ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { - NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 + NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 }; #ifndef CONFIG_SQUASHFS_LZO static const struct squashfs_decompressor squashfs_lzo_comp_ops = { - NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 + NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 }; #endif #ifndef CONFIG_SQUASHFS_XZ static const struct squashfs_decompressor squashfs_xz_comp_ops = { - NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 + NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 }; #endif #ifndef CONFIG_SQUASHFS_ZLIB static const struct squashfs_decompressor squashfs_zlib_comp_ops = { - NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 + NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 }; #endif static const struct squashfs_decompressor squashfs_unknown_comp_ops = { - NULL, NULL, NULL, 0, "unknown", 0 + NULL, NULL, NULL, NULL, 0, "unknown", 0 }; static const struct squashfs_decompressor *decompressor[] = { @@ -83,10 +84,11 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) } -void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags) +static void *get_comp_opts(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; - void *strm, *buffer = NULL; + void *buffer = NULL, *comp_opts; + struct squashfs_page_actor *actor = NULL; int length = 0; /* @@ -94,23 +96,46 @@ void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags) */ if (SQUASHFS_COMP_OPTS(flags)) { buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); - if (buffer == NULL) - return ERR_PTR(-ENOMEM); + if (buffer == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto out; + } + + actor = squashfs_page_actor_init(&buffer, 1, 0); + if (actor == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto out; + } - length = squashfs_read_data(sb, &buffer, - sizeof(struct squashfs_super_block), 0, NULL, - PAGE_CACHE_SIZE, 1); + length = squashfs_read_data(sb, + sizeof(struct squashfs_super_block), 0, NULL, actor); if (length < 0) { - strm = ERR_PTR(length); - goto finished; + comp_opts = ERR_PTR(length); + goto out; } } - strm = msblk->decompressor->init(msblk, buffer, length); + comp_opts = squashfs_comp_opts(msblk, buffer, length); -finished: +out: + kfree(actor); kfree(buffer); + return comp_opts; +} + + +void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + void *stream, *comp_opts = get_comp_opts(sb, flags); + + if (IS_ERR(comp_opts)) + return comp_opts; + + stream = squashfs_decompressor_create(msblk, comp_opts); + if (IS_ERR(stream)) + kfree(comp_opts); - return strm; + return stream; } diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 330073e2902..af098532180 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -24,28 +24,22 @@ */ struct squashfs_decompressor { - void *(*init)(struct squashfs_sb_info *, void *, int); + void *(*init)(struct squashfs_sb_info *, void *); + void *(*comp_opts)(struct squashfs_sb_info *, void *, int); void (*free)(void *); - int (*decompress)(struct squashfs_sb_info *, void **, - struct buffer_head **, int, int, int, int, int); + int (*decompress)(struct squashfs_sb_info *, void *, + struct buffer_head **, int, int, int, + struct squashfs_page_actor *); int id; char *name; int supported; }; -static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, - void *s) +static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk, + void *buff, int length) { - if (msblk->decompressor) - msblk->decompressor->free(s); -} - -static inline int squashfs_decompress(struct squashfs_sb_info *msblk, - void **buffer, struct buffer_head **bh, int b, int offset, int length, - int srclength, int pages) -{ - return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, - length, srclength, pages); + return msblk->decompressor->comp_opts ? + msblk->decompressor->comp_opts(msblk, buff, length) : NULL; } #ifdef CONFIG_SQUASHFS_XZ diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c new file mode 100644 index 00000000000..d6008a63647 --- /dev/null +++ b/fs/squashfs/decompressor_multi.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2013 + * Minchan Kim <minchan@kernel.org> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/cpumask.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file implements multi-threaded decompression in the + * decompressor framework + */ + + +/* + * The reason that multiply two is that a CPU can request new I/O + * while it is waiting previous request. + */ +#define MAX_DECOMPRESSOR (num_online_cpus() * 2) + + +int squashfs_max_decompressors(void) +{ + return MAX_DECOMPRESSOR; +} + + +struct squashfs_stream { + void *comp_opts; + struct list_head strm_list; + struct mutex mutex; + int avail_decomp; + wait_queue_head_t wait; +}; + + +struct decomp_stream { + void *stream; + struct list_head list; +}; + + +static void put_decomp_stream(struct decomp_stream *decomp_strm, + struct squashfs_stream *stream) +{ + mutex_lock(&stream->mutex); + list_add(&decomp_strm->list, &stream->strm_list); + mutex_unlock(&stream->mutex); + wake_up(&stream->wait); +} + +void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, + void *comp_opts) +{ + struct squashfs_stream *stream; + struct decomp_stream *decomp_strm = NULL; + int err = -ENOMEM; + + stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (!stream) + goto out; + + stream->comp_opts = comp_opts; + mutex_init(&stream->mutex); + INIT_LIST_HEAD(&stream->strm_list); + init_waitqueue_head(&stream->wait); + + /* + * We should have a decompressor at least as default + * so if we fail to allocate new decompressor dynamically, + * we could always fall back to default decompressor and + * file system works. + */ + decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL); + if (!decomp_strm) + goto out; + + decomp_strm->stream = msblk->decompressor->init(msblk, + stream->comp_opts); + if (IS_ERR(decomp_strm->stream)) { + err = PTR_ERR(decomp_strm->stream); + goto out; + } + + list_add(&decomp_strm->list, &stream->strm_list); + stream->avail_decomp = 1; + return stream; + +out: + kfree(decomp_strm); + kfree(stream); + return ERR_PTR(err); +} + + +void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +{ + struct squashfs_stream *stream = msblk->stream; + if (stream) { + struct decomp_stream *decomp_strm; + + while (!list_empty(&stream->strm_list)) { + decomp_strm = list_entry(stream->strm_list.prev, + struct decomp_stream, list); + list_del(&decomp_strm->list); + msblk->decompressor->free(decomp_strm->stream); + kfree(decomp_strm); + stream->avail_decomp--; + } + WARN_ON(stream->avail_decomp); + kfree(stream->comp_opts); + kfree(stream); + } +} + + +static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, + struct squashfs_stream *stream) +{ + struct decomp_stream *decomp_strm; + + while (1) { + mutex_lock(&stream->mutex); + + /* There is available decomp_stream */ + if (!list_empty(&stream->strm_list)) { + decomp_strm = list_entry(stream->strm_list.prev, + struct decomp_stream, list); + list_del(&decomp_strm->list); + mutex_unlock(&stream->mutex); + break; + } + + /* + * If there is no available decomp and already full, + * let's wait for releasing decomp from other users. + */ + if (stream->avail_decomp >= MAX_DECOMPRESSOR) + goto wait; + + /* Let's allocate new decomp */ + decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL); + if (!decomp_strm) + goto wait; + + decomp_strm->stream = msblk->decompressor->init(msblk, + stream->comp_opts); + if (IS_ERR(decomp_strm->stream)) { + kfree(decomp_strm); + goto wait; + } + + stream->avail_decomp++; + WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR); + + mutex_unlock(&stream->mutex); + break; +wait: + /* + * If system memory is tough, let's for other's + * releasing instead of hurting VM because it could + * make page cache thrashing. + */ + mutex_unlock(&stream->mutex); + wait_event(stream->wait, + !list_empty(&stream->strm_list)); + } + + return decomp_strm; +} + + +int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, + int b, int offset, int length, struct squashfs_page_actor *output) +{ + int res; + struct squashfs_stream *stream = msblk->stream; + struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream); + res = msblk->decompressor->decompress(msblk, decomp_stream->stream, + bh, b, offset, length, output); + put_decomp_stream(decomp_stream, stream); + if (res < 0) + ERROR("%s decompression failed, data probably corrupt\n", + msblk->decompressor->name); + return res; +} diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c new file mode 100644 index 00000000000..23a9c28ad8e --- /dev/null +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/percpu.h> +#include <linux/buffer_head.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file implements multi-threaded decompression using percpu + * variables, one thread per cpu core. + */ + +struct squashfs_stream { + void *stream; +}; + +void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, + void *comp_opts) +{ + struct squashfs_stream *stream; + struct squashfs_stream __percpu *percpu; + int err, cpu; + + percpu = alloc_percpu(struct squashfs_stream); + if (percpu == NULL) + return ERR_PTR(-ENOMEM); + + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + stream->stream = msblk->decompressor->init(msblk, comp_opts); + if (IS_ERR(stream->stream)) { + err = PTR_ERR(stream->stream); + goto out; + } + } + + kfree(comp_opts); + return (__force void *) percpu; + +out: + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + if (!IS_ERR_OR_NULL(stream->stream)) + msblk->decompressor->free(stream->stream); + } + free_percpu(percpu); + return ERR_PTR(err); +} + +void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +{ + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; + struct squashfs_stream *stream; + int cpu; + + if (msblk->stream) { + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + msblk->decompressor->free(stream->stream); + } + free_percpu(percpu); + } +} + +int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, + int b, int offset, int length, struct squashfs_page_actor *output) +{ + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; + struct squashfs_stream *stream = get_cpu_ptr(percpu); + int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + offset, length, output); + put_cpu_ptr(stream); + + if (res < 0) + ERROR("%s decompression failed, data probably corrupt\n", + msblk->decompressor->name); + + return res; +} + +int squashfs_max_decompressors(void) +{ + return num_possible_cpus(); +} diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c new file mode 100644 index 00000000000..a6c75929a00 --- /dev/null +++ b/fs/squashfs/decompressor_single.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file implements single-threaded decompression in the + * decompressor framework + */ + +struct squashfs_stream { + void *stream; + struct mutex mutex; +}; + +void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, + void *comp_opts) +{ + struct squashfs_stream *stream; + int err = -ENOMEM; + + stream = kmalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto out; + + stream->stream = msblk->decompressor->init(msblk, comp_opts); + if (IS_ERR(stream->stream)) { + err = PTR_ERR(stream->stream); + goto out; + } + + kfree(comp_opts); + mutex_init(&stream->mutex); + return stream; + +out: + kfree(stream); + return ERR_PTR(err); +} + +void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +{ + struct squashfs_stream *stream = msblk->stream; + + if (stream) { + msblk->decompressor->free(stream->stream); + kfree(stream); + } +} + +int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, + int b, int offset, int length, struct squashfs_page_actor *output) +{ + int res; + struct squashfs_stream *stream = msblk->stream; + + mutex_lock(&stream->mutex); + res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + offset, length, output); + mutex_unlock(&stream->mutex); + + if (res < 0) + ERROR("%s decompression failed, data probably corrupt\n", + msblk->decompressor->name); + + return res; +} + +int squashfs_max_decompressors(void) +{ + return 1; +} diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 8ca62c28fe1..e5c9689062b 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -370,77 +370,15 @@ static int read_blocklist(struct inode *inode, int index, u64 *block) return le32_to_cpu(size); } - -static int squashfs_readpage(struct file *file, struct page *page) +/* Copy data into page cache */ +void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, + int bytes, int offset) { struct inode *inode = page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int bytes, i, offset = 0, sparse = 0; - struct squashfs_cache_entry *buffer = NULL; void *pageaddr; - - int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; - int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); - int start_index = page->index & ~mask; - int end_index = start_index | mask; - int file_end = i_size_read(inode) >> msblk->block_log; - - TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", - page->index, squashfs_i(inode)->start); - - if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) - goto out; - - if (index < file_end || squashfs_i(inode)->fragment_block == - SQUASHFS_INVALID_BLK) { - /* - * Reading a datablock from disk. Need to read block list - * to get location and block size. - */ - u64 block = 0; - int bsize = read_blocklist(inode, index, &block); - if (bsize < 0) - goto error_out; - - if (bsize == 0) { /* hole */ - bytes = index == file_end ? - (i_size_read(inode) & (msblk->block_size - 1)) : - msblk->block_size; - sparse = 1; - } else { - /* - * Read and decompress datablock. - */ - buffer = squashfs_get_datablock(inode->i_sb, - block, bsize); - if (buffer->error) { - ERROR("Unable to read page, block %llx, size %x" - "\n", block, bsize); - squashfs_cache_put(buffer); - goto error_out; - } - bytes = buffer->length; - } - } else { - /* - * Datablock is stored inside a fragment (tail-end packed - * block). - */ - buffer = squashfs_get_fragment(inode->i_sb, - squashfs_i(inode)->fragment_block, - squashfs_i(inode)->fragment_size); - - if (buffer->error) { - ERROR("Unable to read page, block %llx, size %x\n", - squashfs_i(inode)->fragment_block, - squashfs_i(inode)->fragment_size); - squashfs_cache_put(buffer); - goto error_out; - } - bytes = i_size_read(inode) & (msblk->block_size - 1); - offset = squashfs_i(inode)->fragment_offset; - } + int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int start_index = page->index & ~mask, end_index = start_index | mask; /* * Loop copying datablock into pages. As the datablock likely covers @@ -451,7 +389,7 @@ static int squashfs_readpage(struct file *file, struct page *page) for (i = start_index; i <= end_index && bytes > 0; i++, bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { struct page *push_page; - int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE); + int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0; TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); @@ -475,11 +413,75 @@ skip_page: if (i != page->index) page_cache_release(push_page); } +} + +/* Read datablock stored packed inside a fragment (tail-end packed block) */ +static int squashfs_readpage_fragment(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + int res = buffer->error; + + if (res) + ERROR("Unable to read page, block %llx, size %x\n", + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + else + squashfs_copy_cache(page, buffer, i_size_read(inode) & + (msblk->block_size - 1), + squashfs_i(inode)->fragment_offset); + + squashfs_cache_put(buffer); + return res; +} - if (!sparse) - squashfs_cache_put(buffer); +static int squashfs_readpage_sparse(struct page *page, int index, int file_end) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int bytes = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + squashfs_copy_cache(page, NULL, bytes, 0); return 0; +} + +static int squashfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); + int file_end = i_size_read(inode) >> msblk->block_log; + int res; + void *pageaddr; + + TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", + page->index, squashfs_i(inode)->start); + + if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) + goto out; + + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + if (bsize < 0) + goto error_out; + + if (bsize == 0) + res = squashfs_readpage_sparse(page, index, file_end); + else + res = squashfs_readpage_block(page, block, bsize); + } else + res = squashfs_readpage_fragment(page); + + if (!res) + return 0; error_out: SetPageError(page); diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c new file mode 100644 index 00000000000..f2310d2a201 --- /dev/null +++ b/fs/squashfs/file_cache.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/pagemap.h> +#include <linux/mutex.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* Read separately compressed datablock and memcopy into page cache */ +int squashfs_readpage_block(struct page *page, u64 block, int bsize) +{ + struct inode *i = page->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, + block, bsize); + int res = buffer->error; + + if (res) + ERROR("Unable to read page, block %llx, size %x\n", block, + bsize); + else + squashfs_copy_cache(page, buffer, buffer->length, 0); + + squashfs_cache_put(buffer); + return res; +} diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c new file mode 100644 index 00000000000..62a0de6632e --- /dev/null +++ b/fs/squashfs/file_direct.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/pagemap.h> +#include <linux/mutex.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "page_actor.h" + +static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, + int pages, struct page **page); + +/* Read separately compressed datablock directly into page cache */ +int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) + +{ + struct inode *inode = target_page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + + int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int start_index = target_page->index & ~mask; + int end_index = start_index | mask; + int i, n, pages, missing_pages, bytes, res = -ENOMEM; + struct page **page; + struct squashfs_page_actor *actor; + void *pageaddr; + + if (end_index > file_end) + end_index = file_end; + + pages = end_index - start_index + 1; + + page = kmalloc(sizeof(void *) * pages, GFP_KERNEL); + if (page == NULL) + return res; + + /* + * Create a "page actor" which will kmap and kunmap the + * page cache pages appropriately within the decompressor + */ + actor = squashfs_page_actor_init_special(page, pages, 0); + if (actor == NULL) + goto out; + + /* Try to grab all the pages covered by the Squashfs block */ + for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { + page[i] = (n == target_page->index) ? target_page : + grab_cache_page_nowait(target_page->mapping, n); + + if (page[i] == NULL) { + missing_pages++; + continue; + } + + if (PageUptodate(page[i])) { + unlock_page(page[i]); + page_cache_release(page[i]); + page[i] = NULL; + missing_pages++; + } + } + + if (missing_pages) { + /* + * Couldn't get one or more pages, this page has either + * been VM reclaimed, but others are still in the page cache + * and uptodate, or we're racing with another thread in + * squashfs_readpage also trying to grab them. Fall back to + * using an intermediate buffer. + */ + res = squashfs_read_cache(target_page, block, bsize, pages, + page); + if (res < 0) + goto mark_errored; + + goto out; + } + + /* Decompress directly into the page cache buffers */ + res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + if (res < 0) + goto mark_errored; + + /* Last page may have trailing bytes not filled */ + bytes = res % PAGE_CACHE_SIZE; + if (bytes) { + pageaddr = kmap_atomic(page[pages - 1]); + memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); + kunmap_atomic(pageaddr); + } + + /* Mark pages as uptodate, unlock and release */ + for (i = 0; i < pages; i++) { + flush_dcache_page(page[i]); + SetPageUptodate(page[i]); + unlock_page(page[i]); + if (page[i] != target_page) + page_cache_release(page[i]); + } + + kfree(actor); + kfree(page); + + return 0; + +mark_errored: + /* Decompression failed, mark pages as errored. Target_page is + * dealt with by the caller + */ + for (i = 0; i < pages; i++) { + if (page[i] == NULL || page[i] == target_page) + continue; + flush_dcache_page(page[i]); + SetPageError(page[i]); + unlock_page(page[i]); + page_cache_release(page[i]); + } + +out: + kfree(actor); + kfree(page); + return res; +} + + +static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, + int pages, struct page **page) +{ + struct inode *i = target_page->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, + block, bsize); + int bytes = buffer->length, res = buffer->error, n, offset = 0; + void *pageaddr; + + if (res) { + ERROR("Unable to read page, block %llx, size %x\n", block, + bsize); + goto out; + } + + for (n = 0; n < pages && bytes > 0; n++, + bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { + int avail = min_t(int, bytes, PAGE_CACHE_SIZE); + + if (page[n] == NULL) + continue; + + pageaddr = kmap_atomic(page[n]); + squashfs_copy_data(pageaddr, buffer, offset, avail); + memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + kunmap_atomic(pageaddr); + flush_dcache_page(page[n]); + SetPageUptodate(page[n]); + unlock_page(page[n]); + if (page[n] != target_page) + page_cache_release(page[n]); + } + +out: + squashfs_cache_put(buffer); + return res; +} diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 00f4dfc5f08..244b9fbfff7 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -31,13 +31,14 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" struct squashfs_lzo { void *input; void *output; }; -static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len) +static void *lzo_init(struct squashfs_sb_info *msblk, void *buff) { int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); @@ -74,22 +75,16 @@ static void lzo_free(void *strm) } -static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer, - struct buffer_head **bh, int b, int offset, int length, int srclength, - int pages) +static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) { - struct squashfs_lzo *stream = msblk->stream; - void *buff = stream->input; + struct squashfs_lzo *stream = strm; + void *buff = stream->input, *data; int avail, i, bytes = length, res; - size_t out_len = srclength; - - mutex_lock(&msblk->read_data_mutex); + size_t out_len = output->length; for (i = 0; i < b; i++) { - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - goto block_release; - avail = min(bytes, msblk->devblksize - offset); memcpy(buff, bh[i]->b_data + offset, avail); buff += avail; @@ -104,24 +99,24 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer, goto failed; res = bytes = (int)out_len; - for (i = 0, buff = stream->output; bytes && i < pages; i++) { - avail = min_t(int, bytes, PAGE_CACHE_SIZE); - memcpy(buffer[i], buff, avail); - buff += avail; - bytes -= avail; + data = squashfs_first_page(output); + buff = stream->output; + while (data) { + if (bytes <= PAGE_CACHE_SIZE) { + memcpy(data, buff, bytes); + break; + } else { + memcpy(data, buff, PAGE_CACHE_SIZE); + buff += PAGE_CACHE_SIZE; + bytes -= PAGE_CACHE_SIZE; + data = squashfs_next_page(output); + } } + squashfs_finish_page(output); - mutex_unlock(&msblk->read_data_mutex); return res; -block_release: - for (; i < b; i++) - put_bh(bh[i]); - failed: - mutex_unlock(&msblk->read_data_mutex); - - ERROR("lzo decompression failed, data probably corrupt\n"); return -EIO; } diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c new file mode 100644 index 00000000000..5a1c11f5644 --- /dev/null +++ b/fs/squashfs/page_actor.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include "page_actor.h" + +/* + * This file contains implementations of page_actor for decompressing into + * an intermediate buffer, and for decompressing directly into the + * page cache. + * + * Calling code should avoid sleeping between calls to squashfs_first_page() + * and squashfs_finish_page(). + */ + +/* Implementation of page_actor for decompressing into intermediate buffer */ +static void *cache_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->buffer[0]; +} + +static void *cache_next_page(struct squashfs_page_actor *actor) +{ + if (actor->next_page == actor->pages) + return NULL; + + return actor->buffer[actor->next_page++]; +} + +static void cache_finish_page(struct squashfs_page_actor *actor) +{ + /* empty */ +} + +struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->buffer = buffer; + actor->pages = pages; + actor->next_page = 0; + actor->squashfs_first_page = cache_first_page; + actor->squashfs_next_page = cache_next_page; + actor->squashfs_finish_page = cache_finish_page; + return actor; +} + +/* Implementation of page_actor for decompressing directly into page cache. */ +static void *direct_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->pageaddr = kmap_atomic(actor->page[0]); +} + +static void *direct_next_page(struct squashfs_page_actor *actor) +{ + if (actor->pageaddr) + kunmap_atomic(actor->pageaddr); + + return actor->pageaddr = actor->next_page == actor->pages ? NULL : + kmap_atomic(actor->page[actor->next_page++]); +} + +static void direct_finish_page(struct squashfs_page_actor *actor) +{ + if (actor->pageaddr) + kunmap_atomic(actor->pageaddr); +} + +struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->page = page; + actor->pages = pages; + actor->next_page = 0; + actor->pageaddr = NULL; + actor->squashfs_first_page = direct_first_page; + actor->squashfs_next_page = direct_next_page; + actor->squashfs_finish_page = direct_finish_page; + return actor; +} diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h new file mode 100644 index 00000000000..26dd82008b8 --- /dev/null +++ b/fs/squashfs/page_actor.h @@ -0,0 +1,81 @@ +#ifndef PAGE_ACTOR_H +#define PAGE_ACTOR_H +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef CONFIG_SQUASHFS_FILE_DIRECT +struct squashfs_page_actor { + void **page; + int pages; + int length; + int next_page; +}; + +static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->page = page; + actor->pages = pages; + actor->next_page = 0; + return actor; +} + +static inline void *squashfs_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->page[0]; +} + +static inline void *squashfs_next_page(struct squashfs_page_actor *actor) +{ + return actor->next_page == actor->pages ? NULL : + actor->page[actor->next_page++]; +} + +static inline void squashfs_finish_page(struct squashfs_page_actor *actor) +{ + /* empty */ +} +#else +struct squashfs_page_actor { + union { + void **buffer; + struct page **page; + }; + void *pageaddr; + void *(*squashfs_first_page)(struct squashfs_page_actor *); + void *(*squashfs_next_page)(struct squashfs_page_actor *); + void (*squashfs_finish_page)(struct squashfs_page_actor *); + int pages; + int length; + int next_page; +}; + +extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page + **, int, int); +static inline void *squashfs_first_page(struct squashfs_page_actor *actor) +{ + return actor->squashfs_first_page(actor); +} +static inline void *squashfs_next_page(struct squashfs_page_actor *actor) +{ + return actor->squashfs_next_page(actor); +} +static inline void squashfs_finish_page(struct squashfs_page_actor *actor) +{ + actor->squashfs_finish_page(actor); +} +#endif +#endif diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index d1266516ed0..887d6d27008 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -25,11 +25,11 @@ #define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) -#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) +#define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args) /* block.c */ -extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, - int, int); +extern int squashfs_read_data(struct super_block *, u64, int, u64 *, + struct squashfs_page_actor *); /* cache.c */ extern struct squashfs_cache *squashfs_cache_init(char *, int, int); @@ -48,7 +48,14 @@ extern void *squashfs_read_table(struct super_block *, u64, int); /* decompressor.c */ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); -extern void *squashfs_decompressor_init(struct super_block *, unsigned short); +extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); + +/* decompressor_xxx.c */ +extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); +extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); +extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **, + int, int, int, struct squashfs_page_actor *); +extern int squashfs_max_decompressors(void); /* export.c */ extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, @@ -59,6 +66,13 @@ extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *); extern __le64 *squashfs_read_fragment_index_table(struct super_block *, u64, u64, unsigned int); +/* file.c */ +void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int, + int); + +/* file_xxx.c */ +extern int squashfs_readpage_block(struct page *, u64, int); + /* id.c */ extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64, diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 52934a22f29..1da565cb50c 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -50,6 +50,7 @@ struct squashfs_cache_entry { wait_queue_head_t wait_queue; struct squashfs_cache *cache; void **data; + struct squashfs_page_actor *actor; }; struct squashfs_sb_info { @@ -63,10 +64,9 @@ struct squashfs_sb_info { __le64 *id_table; __le64 *fragment_index; __le64 *xattr_id_table; - struct mutex read_data_mutex; struct mutex meta_index_mutex; struct meta_index *meta_index; - void *stream; + struct squashfs_stream *stream; __le64 *inode_lookup_table; u64 inode_table; u64 directory_table; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 60553a9053c..031c8d67fd5 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -98,7 +98,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); - mutex_init(&msblk->read_data_mutex); mutex_init(&msblk->meta_index_mutex); /* @@ -206,13 +205,14 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; /* Allocate read_page block */ - msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size); + msblk->read_page = squashfs_cache_init("data", + squashfs_max_decompressors(), msblk->block_size); if (msblk->read_page == NULL) { ERROR("Failed to allocate read_page block\n"); goto failed_mount; } - msblk->stream = squashfs_decompressor_init(sb, flags); + msblk->stream = squashfs_decompressor_setup(sb, flags); if (IS_ERR(msblk->stream)) { err = PTR_ERR(msblk->stream); msblk->stream = NULL; @@ -336,7 +336,7 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); - squashfs_decompressor_free(msblk, msblk->stream); + squashfs_decompressor_destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); @@ -371,6 +371,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int squashfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } @@ -383,7 +384,7 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); - squashfs_decompressor_free(sbi, sbi->stream); + squashfs_decompressor_destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 1760b7d108f..c609624e4b8 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -32,44 +32,70 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" struct squashfs_xz { struct xz_dec *state; struct xz_buf buf; }; -struct comp_opts { +struct disk_comp_opts { __le32 dictionary_size; __le32 flags; }; -static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff, - int len) +struct comp_opts { + int dict_size; +}; + +static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk, + void *buff, int len) { - struct comp_opts *comp_opts = buff; - struct squashfs_xz *stream; - int dict_size = msblk->block_size; - int err, n; + struct disk_comp_opts *comp_opts = buff; + struct comp_opts *opts; + int err = 0, n; + + opts = kmalloc(sizeof(*opts), GFP_KERNEL); + if (opts == NULL) { + err = -ENOMEM; + goto out2; + } if (comp_opts) { /* check compressor options are the expected length */ if (len < sizeof(*comp_opts)) { err = -EIO; - goto failed; + goto out; } - dict_size = le32_to_cpu(comp_opts->dictionary_size); + opts->dict_size = le32_to_cpu(comp_opts->dictionary_size); /* the dictionary size should be 2^n or 2^n+2^(n+1) */ - n = ffs(dict_size) - 1; - if (dict_size != (1 << n) && dict_size != (1 << n) + + n = ffs(opts->dict_size) - 1; + if (opts->dict_size != (1 << n) && opts->dict_size != (1 << n) + (1 << (n + 1))) { err = -EIO; - goto failed; + goto out; } - } + } else + /* use defaults */ + opts->dict_size = max_t(int, msblk->block_size, + SQUASHFS_METADATA_SIZE); + + return opts; + +out: + kfree(opts); +out2: + return ERR_PTR(err); +} + - dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE); +static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff) +{ + struct comp_opts *comp_opts = buff; + struct squashfs_xz *stream; + int err; stream = kmalloc(sizeof(*stream), GFP_KERNEL); if (stream == NULL) { @@ -77,7 +103,7 @@ static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff, goto failed; } - stream->state = xz_dec_init(XZ_PREALLOC, dict_size); + stream->state = xz_dec_init(XZ_PREALLOC, comp_opts->dict_size); if (stream->state == NULL) { kfree(stream); err = -ENOMEM; @@ -103,42 +129,37 @@ static void squashfs_xz_free(void *strm) } -static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, - struct buffer_head **bh, int b, int offset, int length, int srclength, - int pages) +static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) { enum xz_ret xz_err; - int avail, total = 0, k = 0, page = 0; - struct squashfs_xz *stream = msblk->stream; - - mutex_lock(&msblk->read_data_mutex); + int avail, total = 0, k = 0; + struct squashfs_xz *stream = strm; xz_dec_reset(stream->state); stream->buf.in_pos = 0; stream->buf.in_size = 0; stream->buf.out_pos = 0; stream->buf.out_size = PAGE_CACHE_SIZE; - stream->buf.out = buffer[page++]; + stream->buf.out = squashfs_first_page(output); do { if (stream->buf.in_pos == stream->buf.in_size && k < b) { avail = min(length, msblk->devblksize - offset); length -= avail; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto release_mutex; - stream->buf.in = bh[k]->b_data + offset; stream->buf.in_size = avail; stream->buf.in_pos = 0; offset = 0; } - if (stream->buf.out_pos == stream->buf.out_size - && page < pages) { - stream->buf.out = buffer[page++]; - stream->buf.out_pos = 0; - total += PAGE_CACHE_SIZE; + if (stream->buf.out_pos == stream->buf.out_size) { + stream->buf.out = squashfs_next_page(output); + if (stream->buf.out != NULL) { + stream->buf.out_pos = 0; + total += PAGE_CACHE_SIZE; + } } xz_err = xz_dec_run(stream->state, &stream->buf); @@ -147,23 +168,14 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, put_bh(bh[k++]); } while (xz_err == XZ_OK); - if (xz_err != XZ_STREAM_END) { - ERROR("xz_dec_run error, data probably corrupt\n"); - goto release_mutex; - } - - if (k < b) { - ERROR("xz_uncompress error, input remaining\n"); - goto release_mutex; - } + squashfs_finish_page(output); - total += stream->buf.out_pos; - mutex_unlock(&msblk->read_data_mutex); - return total; + if (xz_err != XZ_STREAM_END || k < b) + goto out; -release_mutex: - mutex_unlock(&msblk->read_data_mutex); + return total + stream->buf.out_pos; +out: for (; k < b; k++) put_bh(bh[k]); @@ -172,6 +184,7 @@ release_mutex: const struct squashfs_decompressor squashfs_xz_comp_ops = { .init = squashfs_xz_init, + .comp_opts = squashfs_xz_comp_opts, .free = squashfs_xz_free, .decompress = squashfs_xz_uncompress, .id = XZ_COMPRESSION, diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 55d918fd2d8..8727caba688 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -32,8 +32,9 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" -static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len) +static void *zlib_init(struct squashfs_sb_info *dummy, void *buff) { z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); if (stream == NULL) @@ -61,44 +62,37 @@ static void zlib_free(void *strm) } -static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, - struct buffer_head **bh, int b, int offset, int length, int srclength, - int pages) +static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) { - int zlib_err, zlib_init = 0; - int k = 0, page = 0; - z_stream *stream = msblk->stream; - - mutex_lock(&msblk->read_data_mutex); + int zlib_err, zlib_init = 0, k = 0; + z_stream *stream = strm; - stream->avail_out = 0; + stream->avail_out = PAGE_CACHE_SIZE; + stream->next_out = squashfs_first_page(output); stream->avail_in = 0; do { if (stream->avail_in == 0 && k < b) { int avail = min(length, msblk->devblksize - offset); length -= avail; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto release_mutex; - stream->next_in = bh[k]->b_data + offset; stream->avail_in = avail; offset = 0; } - if (stream->avail_out == 0 && page < pages) { - stream->next_out = buffer[page++]; - stream->avail_out = PAGE_CACHE_SIZE; + if (stream->avail_out == 0) { + stream->next_out = squashfs_next_page(output); + if (stream->next_out != NULL) + stream->avail_out = PAGE_CACHE_SIZE; } if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { - ERROR("zlib_inflateInit returned unexpected " - "result 0x%x, srclength %d\n", - zlib_err, srclength); - goto release_mutex; + squashfs_finish_page(output); + goto out; } zlib_init = 1; } @@ -109,29 +103,21 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, put_bh(bh[k++]); } while (zlib_err == Z_OK); - if (zlib_err != Z_STREAM_END) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } + squashfs_finish_page(output); - zlib_err = zlib_inflateEnd(stream); - if (zlib_err != Z_OK) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } + if (zlib_err != Z_STREAM_END) + goto out; - if (k < b) { - ERROR("zlib_uncompress error, data remaining\n"); - goto release_mutex; - } + zlib_err = zlib_inflateEnd(stream); + if (zlib_err != Z_OK) + goto out; - length = stream->total_out; - mutex_unlock(&msblk->read_data_mutex); - return length; + if (k < b) + goto out; -release_mutex: - mutex_unlock(&msblk->read_data_mutex); + return stream->total_out; +out: for (; k < b; k++) put_bh(bh[k]); diff --git a/fs/stat.c b/fs/stat.c index d0ea7ef75e2..ae0c3cef992 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -37,14 +37,21 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) EXPORT_SYMBOL(generic_fillattr); -int vfs_getattr(struct path *path, struct kstat *stat) +/** + * vfs_getattr_nosec - getattr without security checks + * @path: file to get attributes from + * @stat: structure to return attributes in + * + * Get attributes without calling security_inode_getattr. + * + * Currently the only caller other than vfs_getattr is internal to the + * filehandle lookup code, which uses only the inode number and returns + * no attributes to any user. Any other code probably wants + * vfs_getattr. + */ +int vfs_getattr_nosec(struct path *path, struct kstat *stat) { struct inode *inode = path->dentry->d_inode; - int retval; - - retval = security_inode_getattr(path->mnt, path->dentry); - if (retval) - return retval; if (inode->i_op->getattr) return inode->i_op->getattr(path->mnt, path->dentry, stat); @@ -53,6 +60,18 @@ int vfs_getattr(struct path *path, struct kstat *stat) return 0; } +EXPORT_SYMBOL(vfs_getattr_nosec); + +int vfs_getattr(struct path *path, struct kstat *stat) +{ + int retval; + + retval = security_inode_getattr(path->mnt, path->dentry); + if (retval) + return retval; + return vfs_getattr_nosec(path, stat); +} + EXPORT_SYMBOL(vfs_getattr); int vfs_fstat(unsigned int fd, struct kstat *stat) diff --git a/fs/statfs.c b/fs/statfs.c index c219e733f55..083dc0ac914 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -94,7 +94,7 @@ retry: int fd_statfs(int fd, struct kstatfs *st) { - struct fd f = fdget(fd); + struct fd f = fdget_raw(fd); int error = -EBADF; if (f.file) { error = vfs_statfs(&f.file->f_path, st); diff --git a/fs/super.c b/fs/super.c index 0225c20f877..d20d5b11ded 100644 --- a/fs/super.c +++ b/fs/super.c @@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); - if (!grab_super_passive(sb)) - return 0; - + /* + * Don't call grab_super_passive as it is a potential + * scalability bottleneck. The counts could get updated + * between super_cache_count and super_cache_scan anyway. + * Call to super_cache_count with shrinker_rwsem held + * ensures the safety of call to list_lru_count_node() and + * s_op->nr_cached_objects(). + */ if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc->nid); @@ -125,37 +130,27 @@ static unsigned long super_cache_count(struct shrinker *shrink, sc->nid); total_objects = vfs_pressure_ratio(total_objects); - drop_super(sb); return total_objects; } -static int init_sb_writers(struct super_block *s, struct file_system_type *type) -{ - int err; - int i; - - for (i = 0; i < SB_FREEZE_LEVELS; i++) { - err = percpu_counter_init(&s->s_writers.counter[i], 0); - if (err < 0) - goto err_out; - lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], - &type->s_writers_key[i], 0); - } - init_waitqueue_head(&s->s_writers.wait); - init_waitqueue_head(&s->s_writers.wait_unfrozen); - return 0; -err_out: - while (--i >= 0) - percpu_counter_destroy(&s->s_writers.counter[i]); - return err; -} - -static void destroy_sb_writers(struct super_block *s) +/** + * destroy_super - frees a superblock + * @s: superblock to free + * + * Frees a superblock. + */ +static void destroy_super(struct super_block *s) { int i; - + list_lru_destroy(&s->s_dentry_lru); + list_lru_destroy(&s->s_inode_lru); for (i = 0; i < SB_FREEZE_LEVELS; i++) percpu_counter_destroy(&s->s_writers.counter[i]); + security_sb_free(s); + WARN_ON(!list_empty(&s->s_mounts)); + kfree(s->s_subtype); + kfree(s->s_options); + kfree_rcu(s, rcu); } /** @@ -170,111 +165,75 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; + int i; - if (s) { - if (security_sb_alloc(s)) - goto out_free_sb; + if (!s) + return NULL; -#ifdef CONFIG_SMP - s->s_files = alloc_percpu(struct list_head); - if (!s->s_files) - goto err_out; - else { - int i; + INIT_LIST_HEAD(&s->s_mounts); - for_each_possible_cpu(i) - INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i)); - } -#else - INIT_LIST_HEAD(&s->s_files); -#endif - if (init_sb_writers(s, type)) - goto err_out; - s->s_flags = flags; - s->s_bdi = &default_backing_dev_info; - INIT_HLIST_NODE(&s->s_instances); - INIT_HLIST_BL_HEAD(&s->s_anon); - INIT_LIST_HEAD(&s->s_inodes); - - if (list_lru_init(&s->s_dentry_lru)) - goto err_out; - if (list_lru_init(&s->s_inode_lru)) - goto err_out_dentry_lru; - - INIT_LIST_HEAD(&s->s_mounts); - init_rwsem(&s->s_umount); - lockdep_set_class(&s->s_umount, &type->s_umount_key); - /* - * sget() can have s_umount recursion. - * - * When it cannot find a suitable sb, it allocates a new - * one (this one), and tries again to find a suitable old - * one. - * - * In case that succeeds, it will acquire the s_umount - * lock of the old one. Since these are clearly distrinct - * locks, and this object isn't exposed yet, there's no - * risk of deadlocks. - * - * Annotate this by putting this lock in a different - * subclass. - */ - down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); - s->s_count = 1; - atomic_set(&s->s_active, 1); - mutex_init(&s->s_vfs_rename_mutex); - lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); - mutex_init(&s->s_dquot.dqio_mutex); - mutex_init(&s->s_dquot.dqonoff_mutex); - init_rwsem(&s->s_dquot.dqptr_sem); - s->s_maxbytes = MAX_NON_LFS; - s->s_op = &default_op; - s->s_time_gran = 1000000000; - s->cleancache_poolid = -1; - - s->s_shrink.seeks = DEFAULT_SEEKS; - s->s_shrink.scan_objects = super_cache_scan; - s->s_shrink.count_objects = super_cache_count; - s->s_shrink.batch = 1024; - s->s_shrink.flags = SHRINKER_NUMA_AWARE; + if (security_sb_alloc(s)) + goto fail; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) { + if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) + goto fail; + lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], + &type->s_writers_key[i], 0); } -out: + init_waitqueue_head(&s->s_writers.wait); + init_waitqueue_head(&s->s_writers.wait_unfrozen); + s->s_flags = flags; + s->s_bdi = &default_backing_dev_info; + INIT_HLIST_NODE(&s->s_instances); + INIT_HLIST_BL_HEAD(&s->s_anon); + INIT_LIST_HEAD(&s->s_inodes); + + if (list_lru_init(&s->s_dentry_lru)) + goto fail; + if (list_lru_init(&s->s_inode_lru)) + goto fail; + + init_rwsem(&s->s_umount); + lockdep_set_class(&s->s_umount, &type->s_umount_key); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. Since these are clearly distrinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); + s->s_count = 1; + atomic_set(&s->s_active, 1); + mutex_init(&s->s_vfs_rename_mutex); + lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); + mutex_init(&s->s_dquot.dqio_mutex); + mutex_init(&s->s_dquot.dqonoff_mutex); + init_rwsem(&s->s_dquot.dqptr_sem); + s->s_maxbytes = MAX_NON_LFS; + s->s_op = &default_op; + s->s_time_gran = 1000000000; + s->cleancache_poolid = -1; + + s->s_shrink.seeks = DEFAULT_SEEKS; + s->s_shrink.scan_objects = super_cache_scan; + s->s_shrink.count_objects = super_cache_count; + s->s_shrink.batch = 1024; + s->s_shrink.flags = SHRINKER_NUMA_AWARE; return s; -err_out_dentry_lru: - list_lru_destroy(&s->s_dentry_lru); -err_out: - security_sb_free(s); -#ifdef CONFIG_SMP - if (s->s_files) - free_percpu(s->s_files); -#endif - destroy_sb_writers(s); -out_free_sb: - kfree(s); - s = NULL; - goto out; -} - -/** - * destroy_super - frees a superblock - * @s: superblock to free - * - * Frees a superblock. - */ -static inline void destroy_super(struct super_block *s) -{ - list_lru_destroy(&s->s_dentry_lru); - list_lru_destroy(&s->s_inode_lru); -#ifdef CONFIG_SMP - free_percpu(s->s_files); -#endif - destroy_sb_writers(s); - security_sb_free(s); - WARN_ON(!list_empty(&s->s_mounts)); - kfree(s->s_subtype); - kfree(s->s_options); - kfree(s); +fail: + destroy_super(s); + return NULL; } /* Superblock refcounting */ @@ -321,10 +280,8 @@ void deactivate_locked_super(struct super_block *s) struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { cleancache_invalidate_fs(s); - fs->kill_sb(s); - - /* caches are now gone, we can safely kill the shrinker now */ unregister_shrinker(&s->s_shrink); + fs->kill_sb(s); put_filesystem(fs); put_super(s); @@ -748,7 +705,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); - sync_filesystem(sb); remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); @@ -756,7 +712,8 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) make sure there are no rw files opened */ if (remount_ro) { if (force) { - mark_files_ro(sb); + sb->s_readonly_remount = 1; + smp_wmb(); } else { retval = sb_prepare_remount_readonly(sb); if (retval) @@ -845,7 +802,10 @@ void emergency_remount(void) static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ -static int unnamed_dev_start = 0; /* don't bother trying below it */ +/* Many userspace utilities consider an FSID of 0 invalid. + * Always return at least 1 from get_anon_bdev. + */ +static int unnamed_dev_start = 1; int get_anon_bdev(dev_t *p) { diff --git a/fs/sync.c b/fs/sync.c index 905f3f6b3d8..b28d1dd10e8 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -177,7 +177,7 @@ SYSCALL_DEFINE1(syncfs, int, fd) */ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) { - if (!file->f_op || !file->f_op->fsync) + if (!file->f_op->fsync) return -EINVAL; return file->f_op->fsync(file, start, end, datasync); } @@ -219,23 +219,6 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) return do_fsync(fd, 1); } -/** - * generic_write_sync - perform syncing after a write if file / inode is sync - * @file: file to which the write happened - * @pos: offset where the write started - * @count: length of the write - * - * This is just a simple wrapper about our general syncing function. - */ -int generic_write_sync(struct file *file, loff_t pos, loff_t count) -{ - if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) - return 0; - return vfs_fsync_range(file, pos, pos + count - 1, - (file->f_flags & __O_SYNC) ? 0 : 1); -} -EXPORT_SYMBOL(generic_write_sync); - /* * sys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig index 8c41feacbac..b2756014508 100644 --- a/fs/sysfs/Kconfig +++ b/fs/sysfs/Kconfig @@ -1,6 +1,7 @@ config SYSFS bool "sysfs file system support" if EXPERT default y + select KERNFS help The sysfs filesystem is a virtual filesystem that the kernel uses to export internal kernel objects, their attributes, and their diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile index 7a1ceb946b8..6eff6e1205a 100644 --- a/fs/sysfs/Makefile +++ b/fs/sysfs/Makefile @@ -2,5 +2,4 @@ # Makefile for the sysfs virtual filesystem # -obj-y := inode.o file.o dir.o symlink.o mount.o bin.o \ - group.o +obj-y := file.o dir.o symlink.o mount.o group.o diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c deleted file mode 100644 index c590cabd57b..00000000000 --- a/fs/sysfs/bin.c +++ /dev/null @@ -1,502 +0,0 @@ -/* - * fs/sysfs/bin.c - sysfs binary file implementation - * - * Copyright (c) 2003 Patrick Mochel - * Copyright (c) 2003 Matthew Wilcox - * Copyright (c) 2004 Silicon Graphics, Inc. - * Copyright (c) 2007 SUSE Linux Products GmbH - * Copyright (c) 2007 Tejun Heo <teheo@suse.de> - * - * This file is released under the GPLv2. - * - * Please see Documentation/filesystems/sysfs.txt for more information. - */ - -#undef DEBUG - -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/kernel.h> -#include <linux/kobject.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/mutex.h> -#include <linux/mm.h> -#include <linux/uaccess.h> - -#include "sysfs.h" - -/* - * There's one bin_buffer for each open file. - * - * filp->private_data points to bin_buffer and - * sysfs_dirent->s_bin_attr.buffers points to a the bin_buffer s - * sysfs_dirent->s_bin_attr.buffers is protected by sysfs_bin_lock - */ -static DEFINE_MUTEX(sysfs_bin_lock); - -struct bin_buffer { - struct mutex mutex; - void *buffer; - int mmapped; - const struct vm_operations_struct *vm_ops; - struct file *file; - struct hlist_node list; -}; - -static int -fill_read(struct file *file, char *buffer, loff_t off, size_t count) -{ - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - int rc; - - /* need attr_sd for attr, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; - - rc = -EIO; - if (attr->read) - rc = attr->read(file, kobj, attr, buffer, off, count); - - sysfs_put_active(attr_sd); - - return rc; -} - -static ssize_t -read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) -{ - struct bin_buffer *bb = file->private_data; - int size = file_inode(file)->i_size; - loff_t offs = *off; - int count = min_t(size_t, bytes, PAGE_SIZE); - char *temp; - - if (!bytes) - return 0; - - if (size) { - if (offs > size) - return 0; - if (offs + count > size) - count = size - offs; - } - - temp = kmalloc(count, GFP_KERNEL); - if (!temp) - return -ENOMEM; - - mutex_lock(&bb->mutex); - - count = fill_read(file, bb->buffer, offs, count); - if (count < 0) { - mutex_unlock(&bb->mutex); - goto out_free; - } - - memcpy(temp, bb->buffer, count); - - mutex_unlock(&bb->mutex); - - if (copy_to_user(userbuf, temp, count)) { - count = -EFAULT; - goto out_free; - } - - pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); - - *off = offs + count; - - out_free: - kfree(temp); - return count; -} - -static int -flush_write(struct file *file, char *buffer, loff_t offset, size_t count) -{ - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - int rc; - - /* need attr_sd for attr, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; - - rc = -EIO; - if (attr->write) - rc = attr->write(file, kobj, attr, buffer, offset, count); - - sysfs_put_active(attr_sd); - - return rc; -} - -static ssize_t write(struct file *file, const char __user *userbuf, - size_t bytes, loff_t *off) -{ - struct bin_buffer *bb = file->private_data; - int size = file_inode(file)->i_size; - loff_t offs = *off; - int count = min_t(size_t, bytes, PAGE_SIZE); - char *temp; - - if (!bytes) - return 0; - - if (size) { - if (offs > size) - return 0; - if (offs + count > size) - count = size - offs; - } - - temp = memdup_user(userbuf, count); - if (IS_ERR(temp)) - return PTR_ERR(temp); - - mutex_lock(&bb->mutex); - - memcpy(bb->buffer, temp, count); - - count = flush_write(file, bb->buffer, offs, count); - mutex_unlock(&bb->mutex); - - if (count > 0) - *off = offs + count; - - kfree(temp); - return count; -} - -static void bin_vma_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - - if (!bb->vm_ops) - return; - - if (!sysfs_get_active(attr_sd)) - return; - - if (bb->vm_ops->open) - bb->vm_ops->open(vma); - - sysfs_put_active(attr_sd); -} - -static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - int ret; - - if (!bb->vm_ops) - return VM_FAULT_SIGBUS; - - if (!sysfs_get_active(attr_sd)) - return VM_FAULT_SIGBUS; - - ret = VM_FAULT_SIGBUS; - if (bb->vm_ops->fault) - ret = bb->vm_ops->fault(vma, vmf); - - sysfs_put_active(attr_sd); - return ret; -} - -static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - int ret; - - if (!bb->vm_ops) - return VM_FAULT_SIGBUS; - - if (!sysfs_get_active(attr_sd)) - return VM_FAULT_SIGBUS; - - ret = 0; - if (bb->vm_ops->page_mkwrite) - ret = bb->vm_ops->page_mkwrite(vma, vmf); - else - file_update_time(file); - - sysfs_put_active(attr_sd); - return ret; -} - -static int bin_access(struct vm_area_struct *vma, unsigned long addr, - void *buf, int len, int write) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - int ret; - - if (!bb->vm_ops) - return -EINVAL; - - if (!sysfs_get_active(attr_sd)) - return -EINVAL; - - ret = -EINVAL; - if (bb->vm_ops->access) - ret = bb->vm_ops->access(vma, addr, buf, len, write); - - sysfs_put_active(attr_sd); - return ret; -} - -#ifdef CONFIG_NUMA -static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - int ret; - - if (!bb->vm_ops) - return 0; - - if (!sysfs_get_active(attr_sd)) - return -EINVAL; - - ret = 0; - if (bb->vm_ops->set_policy) - ret = bb->vm_ops->set_policy(vma, new); - - sysfs_put_active(attr_sd); - return ret; -} - -static struct mempolicy *bin_get_policy(struct vm_area_struct *vma, - unsigned long addr) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct mempolicy *pol; - - if (!bb->vm_ops) - return vma->vm_policy; - - if (!sysfs_get_active(attr_sd)) - return vma->vm_policy; - - pol = vma->vm_policy; - if (bb->vm_ops->get_policy) - pol = bb->vm_ops->get_policy(vma, addr); - - sysfs_put_active(attr_sd); - return pol; -} - -static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, - const nodemask_t *to, unsigned long flags) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - int ret; - - if (!bb->vm_ops) - return 0; - - if (!sysfs_get_active(attr_sd)) - return 0; - - ret = 0; - if (bb->vm_ops->migrate) - ret = bb->vm_ops->migrate(vma, from, to, flags); - - sysfs_put_active(attr_sd); - return ret; -} -#endif - -static const struct vm_operations_struct bin_vm_ops = { - .open = bin_vma_open, - .fault = bin_fault, - .page_mkwrite = bin_page_mkwrite, - .access = bin_access, -#ifdef CONFIG_NUMA - .set_policy = bin_set_policy, - .get_policy = bin_get_policy, - .migrate = bin_migrate, -#endif -}; - -static int mmap(struct file *file, struct vm_area_struct *vma) -{ - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - int rc; - - mutex_lock(&bb->mutex); - - /* need attr_sd for attr, its parent for kobj */ - rc = -ENODEV; - if (!sysfs_get_active(attr_sd)) - goto out_unlock; - - rc = -EINVAL; - if (!attr->mmap) - goto out_put; - - rc = attr->mmap(file, kobj, attr, vma); - if (rc) - goto out_put; - - /* - * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() - * to satisfy versions of X which crash if the mmap fails: that - * substitutes a new vm_file, and we don't then want bin_vm_ops. - */ - if (vma->vm_file != file) - goto out_put; - - rc = -EINVAL; - if (bb->mmapped && bb->vm_ops != vma->vm_ops) - goto out_put; - - /* - * It is not possible to successfully wrap close. - * So error if someone is trying to use close. - */ - rc = -EINVAL; - if (vma->vm_ops && vma->vm_ops->close) - goto out_put; - - rc = 0; - bb->mmapped = 1; - bb->vm_ops = vma->vm_ops; - vma->vm_ops = &bin_vm_ops; -out_put: - sysfs_put_active(attr_sd); -out_unlock: - mutex_unlock(&bb->mutex); - - return rc; -} - -static int open(struct inode *inode, struct file *file) -{ - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; - struct bin_buffer *bb = NULL; - int error; - - /* binary file operations requires both @sd and its parent */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; - - error = -EACCES; - if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap)) - goto err_out; - if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap)) - goto err_out; - - error = -ENOMEM; - bb = kzalloc(sizeof(*bb), GFP_KERNEL); - if (!bb) - goto err_out; - - bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!bb->buffer) - goto err_out; - - mutex_init(&bb->mutex); - bb->file = file; - file->private_data = bb; - - mutex_lock(&sysfs_bin_lock); - hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers); - mutex_unlock(&sysfs_bin_lock); - - /* open succeeded, put active references */ - sysfs_put_active(attr_sd); - return 0; - - err_out: - sysfs_put_active(attr_sd); - kfree(bb); - return error; -} - -static int release(struct inode *inode, struct file *file) -{ - struct bin_buffer *bb = file->private_data; - - mutex_lock(&sysfs_bin_lock); - hlist_del(&bb->list); - mutex_unlock(&sysfs_bin_lock); - - kfree(bb->buffer); - kfree(bb); - return 0; -} - -const struct file_operations bin_fops = { - .read = read, - .write = write, - .mmap = mmap, - .llseek = generic_file_llseek, - .open = open, - .release = release, -}; - - -void unmap_bin_file(struct sysfs_dirent *attr_sd) -{ - struct bin_buffer *bb; - - if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR) - return; - - mutex_lock(&sysfs_bin_lock); - - hlist_for_each_entry(bb, &attr_sd->s_bin_attr.buffers, list) { - struct inode *inode = file_inode(bb->file); - - unmap_mapping_range(inode->i_mapping, 0, 0, 1); - } - - mutex_unlock(&sysfs_bin_lock); -} - -/** - * sysfs_create_bin_file - create binary file for object. - * @kobj: object. - * @attr: attribute descriptor. - */ -int sysfs_create_bin_file(struct kobject *kobj, - const struct bin_attribute *attr) -{ - BUG_ON(!kobj || !kobj->sd || !attr); - - return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); -} -EXPORT_SYMBOL_GPL(sysfs_create_bin_file); - -/** - * sysfs_remove_bin_file - remove binary file for object. - * @kobj: object. - * @attr: attribute descriptor. - */ -void sysfs_remove_bin_file(struct kobject *kobj, - const struct bin_attribute *attr) -{ - sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name); -} -EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 4d83cedb9fc..0b45ff42f37 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -13,835 +13,55 @@ #undef DEBUG #include <linux/fs.h> -#include <linux/mount.h> -#include <linux/module.h> #include <linux/kobject.h> -#include <linux/namei.h> -#include <linux/idr.h> -#include <linux/completion.h> -#include <linux/mutex.h> #include <linux/slab.h> -#include <linux/security.h> -#include <linux/hash.h> #include "sysfs.h" -DEFINE_MUTEX(sysfs_mutex); -DEFINE_SPINLOCK(sysfs_assoc_lock); +DEFINE_SPINLOCK(sysfs_symlink_target_lock); -#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb); - -static DEFINE_SPINLOCK(sysfs_ino_lock); -static DEFINE_IDA(sysfs_ino_ida); - -/** - * sysfs_name_hash - * @ns: Namespace tag to hash - * @name: Null terminated string to hash - * - * Returns 31 bit hash of ns + name (so it fits in an off_t ) - */ -static unsigned int sysfs_name_hash(const void *ns, const char *name) -{ - unsigned long hash = init_name_hash(); - unsigned int len = strlen(name); - while (len--) - hash = partial_name_hash(*name++, hash); - hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); - hash &= 0x7fffffffU; - /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ - if (hash < 1) - hash += 2; - if (hash >= INT_MAX) - hash = INT_MAX - 1; - return hash; -} - -static int sysfs_name_compare(unsigned int hash, const void *ns, - const char *name, const struct sysfs_dirent *sd) -{ - if (hash != sd->s_hash) - return hash - sd->s_hash; - if (ns != sd->s_ns) - return ns - sd->s_ns; - return strcmp(name, sd->s_name); -} - -static int sysfs_sd_compare(const struct sysfs_dirent *left, - const struct sysfs_dirent *right) -{ - return sysfs_name_compare(left->s_hash, left->s_ns, left->s_name, - right); -} - -/** - * sysfs_link_sibling - link sysfs_dirent into sibling rbtree - * @sd: sysfs_dirent of interest - * - * Link @sd into its sibling rbtree which starts from - * sd->s_parent->s_dir.children. - * - * Locking: - * mutex_lock(sysfs_mutex) - * - * RETURNS: - * 0 on susccess -EEXIST on failure. - */ -static int sysfs_link_sibling(struct sysfs_dirent *sd) -{ - struct rb_node **node = &sd->s_parent->s_dir.children.rb_node; - struct rb_node *parent = NULL; - - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs++; - - while (*node) { - struct sysfs_dirent *pos; - int result; - - pos = to_sysfs_dirent(*node); - parent = *node; - result = sysfs_sd_compare(sd, pos); - if (result < 0) - node = &pos->s_rb.rb_left; - else if (result > 0) - node = &pos->s_rb.rb_right; - else - return -EEXIST; - } - /* add new node and rebalance the tree */ - rb_link_node(&sd->s_rb, parent, node); - rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children); - return 0; -} - -/** - * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree - * @sd: sysfs_dirent of interest - * - * Unlink @sd from its sibling rbtree which starts from - * sd->s_parent->s_dir.children. - * - * Locking: - * mutex_lock(sysfs_mutex) - */ -static void sysfs_unlink_sibling(struct sysfs_dirent *sd) -{ - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs--; - - rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -/* Test for attributes that want to ignore lockdep for read-locking */ -static bool ignore_lockdep(struct sysfs_dirent *sd) -{ - return sysfs_type(sd) == SYSFS_KOBJ_ATTR && - sd->s_attr.attr->ignore_lockdep; -} - -#else - -static inline bool ignore_lockdep(struct sysfs_dirent *sd) +void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { - return true; -} - -#endif - -/** - * sysfs_get_active - get an active reference to sysfs_dirent - * @sd: sysfs_dirent to get an active reference to - * - * Get an active reference of @sd. This function is noop if @sd - * is NULL. - * - * RETURNS: - * Pointer to @sd on success, NULL on failure. - */ -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) -{ - if (unlikely(!sd)) - return NULL; - - if (!atomic_inc_unless_negative(&sd->s_active)) - return NULL; - - if (likely(!ignore_lockdep(sd))) - rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); - return sd; -} - -/** - * sysfs_put_active - put an active reference to sysfs_dirent - * @sd: sysfs_dirent to put an active reference to - * - * Put an active reference to @sd. This function is noop if @sd - * is NULL. - */ -void sysfs_put_active(struct sysfs_dirent *sd) -{ - int v; + char *buf, *path = NULL; - if (unlikely(!sd)) - return; + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (buf) + path = kernfs_path(parent, buf, PATH_MAX); - if (likely(!ignore_lockdep(sd))) - rwsem_release(&sd->dep_map, 1, _RET_IP_); - v = atomic_dec_return(&sd->s_active); - if (likely(v != SD_DEACTIVATED_BIAS)) - return; + WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n", + path, name); - /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->u.completion. - */ - complete(sd->u.completion); + kfree(buf); } /** - * sysfs_deactivate - deactivate sysfs_dirent - * @sd: sysfs_dirent to deactivate - * - * Deny new active references and drain existing ones. + * sysfs_create_dir_ns - create a directory for an object with a namespace tag + * @kobj: object we're creating directory for + * @ns: the namespace tag to use */ -static void sysfs_deactivate(struct sysfs_dirent *sd) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int v; - - BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); - - if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) - return; - - sd->u.completion = (void *)&wait; - - rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); - /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->u.completion. - */ - v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); - - if (v != SD_DEACTIVATED_BIAS) { - lock_contended(&sd->dep_map, _RET_IP_); - wait_for_completion(&wait); - } - - lock_acquired(&sd->dep_map, _RET_IP_); - rwsem_release(&sd->dep_map, 1, _RET_IP_); -} - -static int sysfs_alloc_ino(unsigned int *pino) -{ - int ino, rc; - - retry: - spin_lock(&sysfs_ino_lock); - rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino); - spin_unlock(&sysfs_ino_lock); - - if (rc == -EAGAIN) { - if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL)) - goto retry; - rc = -ENOMEM; - } - - *pino = ino; - return rc; -} - -static void sysfs_free_ino(unsigned int ino) +int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { - spin_lock(&sysfs_ino_lock); - ida_remove(&sysfs_ino_ida, ino); - spin_unlock(&sysfs_ino_lock); -} - -void release_sysfs_dirent(struct sysfs_dirent *sd) -{ - struct sysfs_dirent *parent_sd; - - repeat: - /* Moving/renaming is always done while holding reference. - * sd->s_parent won't change beneath us. - */ - parent_sd = sd->s_parent; - - WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED), - "sysfs: free using entry: %s/%s\n", - parent_sd ? parent_sd->s_name : "", sd->s_name); - - if (sysfs_type(sd) == SYSFS_KOBJ_LINK) - sysfs_put(sd->s_symlink.target_sd); - if (sysfs_type(sd) & SYSFS_COPY_NAME) - kfree(sd->s_name); - if (sd->s_iattr && sd->s_iattr->ia_secdata) - security_release_secctx(sd->s_iattr->ia_secdata, - sd->s_iattr->ia_secdata_len); - kfree(sd->s_iattr); - sysfs_free_ino(sd->s_ino); - kmem_cache_free(sysfs_dir_cachep, sd); - - sd = parent_sd; - if (sd && atomic_dec_and_test(&sd->s_count)) - goto repeat; -} - -static int sysfs_dentry_delete(const struct dentry *dentry) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED)); -} - -static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct sysfs_dirent *sd; - int type; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - sd = dentry->d_fsdata; - mutex_lock(&sysfs_mutex); - - /* The sysfs dirent has been deleted */ - if (sd->s_flags & SYSFS_FLAG_REMOVED) - goto out_bad; - - /* The sysfs dirent has been moved? */ - if (dentry->d_parent->d_fsdata != sd->s_parent) - goto out_bad; - - /* The sysfs dirent has been renamed */ - if (strcmp(dentry->d_name.name, sd->s_name) != 0) - goto out_bad; - - /* The sysfs dirent has been moved to a different namespace */ - type = KOBJ_NS_TYPE_NONE; - if (sd->s_parent) { - type = sysfs_ns_type(sd->s_parent); - if (type != KOBJ_NS_TYPE_NONE && - sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns) - goto out_bad; - } - - mutex_unlock(&sysfs_mutex); -out_valid: - return 1; -out_bad: - /* Remove the dentry from the dcache hashes. - * If this is a deleted dentry we use d_drop instead of d_delete - * so sysfs doesn't need to cope with negative dentries. - * - * If this is a dentry that has simply been renamed we - * use d_drop to remove it from the dcache lookup on its - * old parent. If this dentry persists later when a lookup - * is performed at its new name the dentry will be readded - * to the dcache hashes. - */ - mutex_unlock(&sysfs_mutex); - - /* If we have submounts we must allow the vfs caches - * to lie about the state of the filesystem to prevent - * leaks and other nasty things. - */ - if (check_submounts_and_drop(dentry) != 0) - goto out_valid; - - return 0; -} - -static void sysfs_dentry_release(struct dentry *dentry) -{ - sysfs_put(dentry->d_fsdata); -} - -const struct dentry_operations sysfs_dentry_ops = { - .d_revalidate = sysfs_dentry_revalidate, - .d_delete = sysfs_dentry_delete, - .d_release = sysfs_dentry_release, -}; - -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) -{ - char *dup_name = NULL; - struct sysfs_dirent *sd; - - if (type & SYSFS_COPY_NAME) { - name = dup_name = kstrdup(name, GFP_KERNEL); - if (!name) - return NULL; - } - - sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); - if (!sd) - goto err_out1; - - if (sysfs_alloc_ino(&sd->s_ino)) - goto err_out2; - - atomic_set(&sd->s_count, 1); - atomic_set(&sd->s_active, 0); - - sd->s_name = name; - sd->s_mode = mode; - sd->s_flags = type | SYSFS_FLAG_REMOVED; - - return sd; - - err_out2: - kmem_cache_free(sysfs_dir_cachep, sd); - err_out1: - kfree(dup_name); - return NULL; -} - -/** - * sysfs_addrm_start - prepare for sysfs_dirent add/remove - * @acxt: pointer to sysfs_addrm_cxt to be used - * @parent_sd: parent sysfs_dirent - * - * This function is called when the caller is about to add or - * remove sysfs_dirent under @parent_sd. This function acquires - * sysfs_mutex. @acxt is used to keep and pass context to - * other addrm functions. - * - * LOCKING: - * Kernel thread context (may sleep). sysfs_mutex is locked on - * return. - */ -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *parent_sd) -{ - memset(acxt, 0, sizeof(*acxt)); - acxt->parent_sd = parent_sd; - - mutex_lock(&sysfs_mutex); -} - -/** - * __sysfs_add_one - add sysfs_dirent to parent without warning - * @acxt: addrm context to use - * @sd: sysfs_dirent to be added - * - * Get @acxt->parent_sd and set sd->s_parent to it and increment - * nlink of parent inode if @sd is a directory and link into the - * children list of the parent. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already - * exists. - */ -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) -{ - struct sysfs_inode_attrs *ps_iattr; - int ret; - - if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(acxt->parent_sd) ? "required" : "invalid", - acxt->parent_sd->s_name, sd->s_name); - return -EINVAL; - } - - sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name); - sd->s_parent = sysfs_get(acxt->parent_sd); - - ret = sysfs_link_sibling(sd); - if (ret) - return ret; - - /* Update timestamps on the parent */ - ps_iattr = acxt->parent_sd->s_iattr; - if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; - } - - /* Mark the entry added into directory tree */ - sd->s_flags &= ~SYSFS_FLAG_REMOVED; - - return 0; -} - -/** - * sysfs_pathname - return full path to sysfs dirent - * @sd: sysfs_dirent whose path we want - * @path: caller allocated buffer of size PATH_MAX - * - * Gives the name "/" to the sysfs_root entry; any path returned - * is relative to wherever sysfs is mounted. - */ -static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) -{ - if (sd->s_parent) { - sysfs_pathname(sd->s_parent, path); - strlcat(path, "/", PATH_MAX); - } - strlcat(path, sd->s_name, PATH_MAX); - return path; -} - -/** - * sysfs_add_one - add sysfs_dirent to parent - * @acxt: addrm context to use - * @sd: sysfs_dirent to be added - * - * Get @acxt->parent_sd and set sd->s_parent to it and increment - * nlink of parent inode if @sd is a directory and link into the - * children list of the parent. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already - * exists. - */ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) -{ - int ret; - - ret = __sysfs_add_one(acxt, sd); - if (ret == -EEXIST) { - char *path = kzalloc(PATH_MAX, GFP_KERNEL); - WARN(1, KERN_WARNING - "sysfs: cannot create duplicate filename '%s'\n", - (path == NULL) ? sd->s_name - : (sysfs_pathname(acxt->parent_sd, path), - strlcat(path, "/", PATH_MAX), - strlcat(path, sd->s_name, PATH_MAX), - path)); - kfree(path); - } - - return ret; -} - -/** - * sysfs_remove_one - remove sysfs_dirent from parent - * @acxt: addrm context to use - * @sd: sysfs_dirent to be removed - * - * Mark @sd removed and drop nlink of parent inode if @sd is a - * directory. @sd is unlinked from the children list. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - */ -void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) -{ - struct sysfs_inode_attrs *ps_iattr; - - BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED); - - sysfs_unlink_sibling(sd); - - /* Update timestamps on the parent */ - ps_iattr = acxt->parent_sd->s_iattr; - if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; - } - - sd->s_flags |= SYSFS_FLAG_REMOVED; - sd->u.removed_list = acxt->removed; - acxt->removed = sd; -} - -/** - * sysfs_addrm_finish - finish up sysfs_dirent add/remove - * @acxt: addrm context to finish up - * - * Finish up sysfs_dirent add/remove. Resources acquired by - * sysfs_addrm_start() are released and removed sysfs_dirents are - * cleaned up. - * - * LOCKING: - * sysfs_mutex is released. - */ -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) -{ - /* release resources acquired by sysfs_addrm_start() */ - mutex_unlock(&sysfs_mutex); - - /* kill removed sysfs_dirents */ - while (acxt->removed) { - struct sysfs_dirent *sd = acxt->removed; - - acxt->removed = sd->u.removed_list; - - sysfs_deactivate(sd); - unmap_bin_file(sd); - sysfs_put(sd); - } -} - -/** - * sysfs_find_dirent - find sysfs_dirent with the given name - * @parent_sd: sysfs_dirent to search under - * @name: name to look for - * - * Look for sysfs_dirent with name @name under @parent_sd. - * - * LOCKING: - * mutex_lock(sysfs_mutex) - * - * RETURNS: - * Pointer to sysfs_dirent if found, NULL if not. - */ -struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const void *ns, - const unsigned char *name) -{ - struct rb_node *node = parent_sd->s_dir.children.rb_node; - unsigned int hash; - - if (!!sysfs_ns_type(parent_sd) != !!ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd) ? "required" : "invalid", - parent_sd->s_name, name); - return NULL; - } - - hash = sysfs_name_hash(ns, name); - while (node) { - struct sysfs_dirent *sd; - int result; - - sd = to_sysfs_dirent(node); - result = sysfs_name_compare(hash, ns, name, sd); - if (result < 0) - node = node->rb_left; - else if (result > 0) - node = node->rb_right; - else - return sd; - } - return NULL; -} - -/** - * sysfs_get_dirent - find and get sysfs_dirent with the given name - * @parent_sd: sysfs_dirent to search under - * @name: name to look for - * - * Look for sysfs_dirent with name @name under @parent_sd and get - * it if found. - * - * LOCKING: - * Kernel thread context (may sleep). Grabs sysfs_mutex. - * - * RETURNS: - * Pointer to sysfs_dirent if found, NULL if not. - */ -struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, - const void *ns, - const unsigned char *name) -{ - struct sysfs_dirent *sd; - - mutex_lock(&sysfs_mutex); - sd = sysfs_find_dirent(parent_sd, ns, name); - sysfs_get(sd); - mutex_unlock(&sysfs_mutex); - - return sd; -} -EXPORT_SYMBOL_GPL(sysfs_get_dirent); - -static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, - enum kobj_ns_type type, const void *ns, const char *name, - struct sysfs_dirent **p_sd) -{ - umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - int rc; - - /* allocate */ - sd = sysfs_new_dirent(name, mode, SYSFS_DIR); - if (!sd) - return -ENOMEM; - - sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT); - sd->s_ns = ns; - sd->s_dir.kobj = kobj; - - /* link in */ - sysfs_addrm_start(&acxt, parent_sd); - rc = sysfs_add_one(&acxt, sd); - sysfs_addrm_finish(&acxt); - - if (rc == 0) - *p_sd = sd; - else - sysfs_put(sd); - - return rc; -} - -int sysfs_create_subdir(struct kobject *kobj, const char *name, - struct sysfs_dirent **p_sd) -{ - return create_dir(kobj, kobj->sd, - KOBJ_NS_TYPE_NONE, NULL, name, p_sd); -} - -/** - * sysfs_read_ns_type: return associated ns_type - * @kobj: the kobject being queried - * - * Each kobject can be tagged with exactly one namespace type - * (i.e. network or user). Return the ns_type associated with - * this object if any - */ -static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) -{ - const struct kobj_ns_type_operations *ops; - enum kobj_ns_type type; - - ops = kobj_child_ns_ops(kobj); - if (!ops) - return KOBJ_NS_TYPE_NONE; - - type = ops->type; - BUG_ON(type <= KOBJ_NS_TYPE_NONE); - BUG_ON(type >= KOBJ_NS_TYPES); - BUG_ON(!kobj_ns_type_registered(type)); - - return type; -} - -/** - * sysfs_create_dir - create a directory for an object. - * @kobj: object we're creating directory for. - */ -int sysfs_create_dir(struct kobject *kobj) -{ - enum kobj_ns_type type; - struct sysfs_dirent *parent_sd, *sd; - const void *ns = NULL; - int error = 0; + struct kernfs_node *parent, *kn; BUG_ON(!kobj); if (kobj->parent) - parent_sd = kobj->parent->sd; + parent = kobj->parent->sd; else - parent_sd = &sysfs_root; + parent = sysfs_root_kn; - if (!parent_sd) + if (!parent) return -ENOENT; - if (sysfs_ns_type(parent_sd)) - ns = kobj->ktype->namespace(kobj); - type = sysfs_read_ns_type(kobj); - - error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd); - if (!error) - kobj->sd = sd; - return error; -} - -static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct dentry *ret = NULL; - struct dentry *parent = dentry->d_parent; - struct sysfs_dirent *parent_sd = parent->d_fsdata; - struct sysfs_dirent *sd; - struct inode *inode; - enum kobj_ns_type type; - const void *ns; - - mutex_lock(&sysfs_mutex); - - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dir->i_sb)->ns[type]; - - sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name); - - /* no such entry */ - if (!sd) { - ret = ERR_PTR(-ENOENT); - goto out_unlock; - } - dentry->d_fsdata = sysfs_get(sd); - - /* attach dentry and inode */ - inode = sysfs_get_inode(dir->i_sb, sd); - if (!inode) { - ret = ERR_PTR(-ENOMEM); - goto out_unlock; - } - - /* instantiate and hash dentry */ - ret = d_materialise_unique(dentry, inode); - out_unlock: - mutex_unlock(&sysfs_mutex); - return ret; -} - -const struct inode_operations sysfs_dir_inode_operations = { - .lookup = sysfs_lookup, - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, -}; - -static void remove_dir(struct sysfs_dirent *sd) -{ - struct sysfs_addrm_cxt acxt; - - sysfs_addrm_start(&acxt, sd->s_parent); - sysfs_remove_one(&acxt, sd); - sysfs_addrm_finish(&acxt); -} - -void sysfs_remove_subdir(struct sysfs_dirent *sd) -{ - remove_dir(sd); -} - - -static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) -{ - struct sysfs_addrm_cxt acxt; - struct rb_node *pos; - - if (!dir_sd) - return; - - pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); - sysfs_addrm_start(&acxt, dir_sd); - pos = rb_first(&dir_sd->s_dir.children); - while (pos) { - struct sysfs_dirent *sd = to_sysfs_dirent(pos); - pos = rb_next(pos); - if (sysfs_type(sd) != SYSFS_DIR) - sysfs_remove_one(&acxt, sd); + kn = kernfs_create_dir_ns(parent, kobject_name(kobj), + S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, kobject_name(kobj)); + return PTR_ERR(kn); } - sysfs_addrm_finish(&acxt); - remove_dir(dir_sd); + kobj->sd = kn; + return 0; } /** @@ -852,201 +72,52 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) * the directory before we remove the directory, and we've inlined * what used to be sysfs_rmdir() below, instead of calling separately. */ - void sysfs_remove_dir(struct kobject *kobj) { - struct sysfs_dirent *sd = kobj->sd; - - spin_lock(&sysfs_assoc_lock); - kobj->sd = NULL; - spin_unlock(&sysfs_assoc_lock); - - __sysfs_remove_dir(sd); -} - -int sysfs_rename(struct sysfs_dirent *sd, - struct sysfs_dirent *new_parent_sd, const void *new_ns, - const char *new_name) -{ - int error; - - mutex_lock(&sysfs_mutex); - - error = 0; - if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) && - (strcmp(sd->s_name, new_name) == 0)) - goto out; /* nothing to rename */ - - error = -EEXIST; - if (sysfs_find_dirent(new_parent_sd, new_ns, new_name)) - goto out; - - /* rename sysfs_dirent */ - if (strcmp(sd->s_name, new_name) != 0) { - error = -ENOMEM; - new_name = kstrdup(new_name, GFP_KERNEL); - if (!new_name) - goto out; - - kfree(sd->s_name); - sd->s_name = new_name; - } + struct kernfs_node *kn = kobj->sd; /* - * Move to the appropriate place in the appropriate directories rbtree. + * In general, kboject owner is responsible for ensuring removal + * doesn't race with other operations and sysfs doesn't provide any + * protection; however, when @kobj is used as a symlink target, the + * symlinking entity usually doesn't own @kobj and thus has no + * control over removal. @kobj->sd may be removed anytime + * and symlink code may end up dereferencing an already freed node. + * + * sysfs_symlink_target_lock synchronizes @kobj->sd + * disassociation against symlink operations so that symlink code + * can safely dereference @kobj->sd. */ - sysfs_unlink_sibling(sd); - sysfs_get(new_parent_sd); - sysfs_put(sd->s_parent); - sd->s_ns = new_ns; - sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name); - sd->s_parent = new_parent_sd; - sysfs_link_sibling(sd); - - error = 0; - out: - mutex_unlock(&sysfs_mutex); - return error; -} - -int sysfs_rename_dir(struct kobject *kobj, const char *new_name) -{ - struct sysfs_dirent *parent_sd = kobj->sd->s_parent; - const void *new_ns = NULL; - - if (sysfs_ns_type(parent_sd)) - new_ns = kobj->ktype->namespace(kobj); - - return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name); -} - -int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) -{ - struct sysfs_dirent *sd = kobj->sd; - struct sysfs_dirent *new_parent_sd; - const void *new_ns = NULL; - - BUG_ON(!sd->s_parent); - if (sysfs_ns_type(sd->s_parent)) - new_ns = kobj->ktype->namespace(kobj); - new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? - new_parent_kobj->sd : &sysfs_root; - - return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name); -} - -/* Relationship between s_mode and the DT_xxx types */ -static inline unsigned char dt_type(struct sysfs_dirent *sd) -{ - return (sd->s_mode >> 12) & 15; -} - -static int sysfs_dir_release(struct inode *inode, struct file *filp) -{ - sysfs_put(filp->private_data); - return 0; -} - -static struct sysfs_dirent *sysfs_dir_pos(const void *ns, - struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos) -{ - if (pos) { - int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && - pos->s_parent == parent_sd && - hash == pos->s_hash; - sysfs_put(pos); - if (!valid) - pos = NULL; - } - if (!pos && (hash > 1) && (hash < INT_MAX)) { - struct rb_node *node = parent_sd->s_dir.children.rb_node; - while (node) { - pos = to_sysfs_dirent(node); + spin_lock(&sysfs_symlink_target_lock); + kobj->sd = NULL; + spin_unlock(&sysfs_symlink_target_lock); - if (hash < pos->s_hash) - node = node->rb_left; - else if (hash > pos->s_hash) - node = node->rb_right; - else - break; - } - } - /* Skip over entries in the wrong namespace */ - while (pos && pos->s_ns != ns) { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); + if (kn) { + WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); + kernfs_remove(kn); } - return pos; } -static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, - struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) +int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, + const void *new_ns) { - pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) - do { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); - } while (pos && pos->s_ns != ns); - return pos; -} - -static int sysfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct dentry *dentry = file->f_path.dentry; - struct sysfs_dirent *parent_sd = dentry->d_fsdata; - struct sysfs_dirent *pos = file->private_data; - enum kobj_ns_type type; - const void *ns; - - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dentry->d_sb)->ns[type]; - - if (!dir_emit_dots(file, ctx)) - return 0; - mutex_lock(&sysfs_mutex); - for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); - pos; - pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { - const char *name = pos->s_name; - unsigned int type = dt_type(pos); - int len = strlen(name); - ino_t ino = pos->s_ino; - ctx->pos = pos->s_hash; - file->private_data = sysfs_get(pos); + struct kernfs_node *parent; + int ret; - mutex_unlock(&sysfs_mutex); - if (!dir_emit(ctx, name, len, ino, type)) - return 0; - mutex_lock(&sysfs_mutex); - } - mutex_unlock(&sysfs_mutex); - file->private_data = NULL; - ctx->pos = INT_MAX; - return 0; + parent = kernfs_get_parent(kobj->sd); + ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); + kernfs_put(parent); + return ret; } -static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, + const void *new_ns) { - struct inode *inode = file_inode(file); - loff_t ret; + struct kernfs_node *kn = kobj->sd; + struct kernfs_node *new_parent; - mutex_lock(&inode->i_mutex); - ret = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + new_parent = new_parent_kobj && new_parent_kobj->sd ? + new_parent_kobj->sd : sysfs_root_kn; - return ret; + return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); } - -const struct file_operations sysfs_dir_operations = { - .read = generic_read_dir, - .iterate = sysfs_readdir, - .release = sysfs_dir_release, - .llseek = sysfs_dir_llseek, -}; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 15ef5eb1366..e9ef59b3abb 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -14,77 +14,56 @@ #include <linux/kobject.h> #include <linux/kallsyms.h> #include <linux/slab.h> -#include <linux/fsnotify.h> -#include <linux/namei.h> -#include <linux/poll.h> #include <linux/list.h> #include <linux/mutex.h> -#include <linux/limits.h> -#include <linux/uaccess.h> +#include <linux/seq_file.h> #include "sysfs.h" +#include "../kernfs/kernfs-internal.h" /* - * There's one sysfs_buffer for each open file and one - * sysfs_open_dirent for each sysfs_dirent with one or more open - * files. - * - * filp->private_data points to sysfs_buffer and - * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open - * is protected by sysfs_open_dirent_lock. + * Determine ktype->sysfs_ops for the given kernfs_node. This function + * must be called while holding an active reference. */ -static DEFINE_SPINLOCK(sysfs_open_dirent_lock); - -struct sysfs_open_dirent { - atomic_t refcnt; - atomic_t event; - wait_queue_head_t poll; - struct list_head buffers; /* goes through sysfs_buffer.list */ -}; +static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) +{ + struct kobject *kobj = kn->parent->priv; -struct sysfs_buffer { - size_t count; - loff_t pos; - char *page; - const struct sysfs_ops *ops; - struct mutex mutex; - int needs_read_fill; - int event; - struct list_head list; -}; + if (kn->flags & KERNFS_LOCKDEP) + lockdep_assert_held(kn); + return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; +} -/** - * fill_read_buffer - allocate and fill buffer from object. - * @dentry: dentry pointer. - * @buffer: data buffer for file. - * - * Allocate @buffer->page, if it hasn't been already, then call the - * kobject's show() method to fill the buffer with this attribute's - * data. - * This is called only once, on the file's first read unless an error - * is returned. +/* + * Reads on sysfs are handled through seq_file, which takes care of hairy + * details like buffering and seeking. The following function pipes + * sysfs_ops->show() result through seq_file. */ -static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer) +static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { - struct sysfs_dirent *attr_sd = dentry->d_fsdata; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - const struct sysfs_ops *ops = buffer->ops; - int ret = 0; + struct kernfs_open_file *of = sf->private; + struct kobject *kobj = of->kn->parent->priv; + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; + char *buf; - if (!buffer->page) - buffer->page = (char *) get_zeroed_page(GFP_KERNEL); - if (!buffer->page) - return -ENOMEM; - - /* need attr_sd for attr and ops, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; - - buffer->event = atomic_read(&attr_sd->s_attr.open->event); - count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page); + /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ + count = seq_get_buf(sf, &buf); + if (count < PAGE_SIZE) { + seq_commit(sf, -1); + return 0; + } + memset(buf, 0, PAGE_SIZE); - sysfs_put_active(attr_sd); + /* + * Invoke show(). Control may reach here via seq file lseek even + * if @ops->show() isn't implemented. + */ + if (ops->show) { + count = ops->show(kobj, of->kn->priv, buf); + if (count < 0) + return count; + } /* * The code works fine with PAGE_SIZE return but it's likely to @@ -96,486 +75,215 @@ static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer) /* Try to struggle along */ count = PAGE_SIZE - 1; } - if (count >= 0) { - buffer->needs_read_fill = 0; - buffer->count = count; - } else { - ret = count; - } - return ret; -} - -/** - * sysfs_read_file - read an attribute. - * @file: file pointer. - * @buf: buffer to fill. - * @count: number of bytes to read. - * @ppos: starting offset in file. - * - * Userspace wants to read an attribute file. The attribute descriptor - * is in the file's ->d_fsdata. The target object is in the directory's - * ->d_fsdata. - * - * We call fill_read_buffer() to allocate and fill the buffer from the - * object's show() method exactly once (if the read is happening from - * the beginning of the file). That should fill the entire buffer with - * all the data the object has to offer for that attribute. - * We then call flush_read_buffer() to copy the buffer to userspace - * in the increments specified. - */ - -static ssize_t -sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct sysfs_buffer *buffer = file->private_data; - ssize_t retval = 0; - - mutex_lock(&buffer->mutex); - if (buffer->needs_read_fill || *ppos == 0) { - retval = fill_read_buffer(file->f_path.dentry, buffer); - if (retval) - goto out; - } - pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", - __func__, count, *ppos, buffer->page); - retval = simple_read_from_buffer(buf, count, ppos, buffer->page, - buffer->count); -out: - mutex_unlock(&buffer->mutex); - return retval; -} - -/** - * fill_write_buffer - copy buffer from userspace. - * @buffer: data buffer for file. - * @buf: data from user. - * @count: number of bytes in @userbuf. - * - * Allocate @buffer->page if it hasn't been already, then - * copy the user-supplied buffer into it. - */ -static int fill_write_buffer(struct sysfs_buffer *buffer, - const char __user *buf, size_t count) -{ - int error; - - if (!buffer->page) - buffer->page = (char *)get_zeroed_page(GFP_KERNEL); - if (!buffer->page) - return -ENOMEM; - - if (count >= PAGE_SIZE) - count = PAGE_SIZE - 1; - error = copy_from_user(buffer->page, buf, count); - buffer->needs_read_fill = 1; - /* if buf is assumed to contain a string, terminate it by \0, - so e.g. sscanf() can scan the string easily */ - buffer->page[count] = 0; - return error ? -EFAULT : count; -} - - -/** - * flush_write_buffer - push buffer to kobject. - * @dentry: dentry to the attribute - * @buffer: data buffer for file. - * @count: number of bytes - * - * Get the correct pointers for the kobject and the attribute we're - * dealing with, then call the store() method for the attribute, - * passing the buffer that we acquired in fill_write_buffer(). - */ -static int flush_write_buffer(struct dentry *dentry, - struct sysfs_buffer *buffer, size_t count) -{ - struct sysfs_dirent *attr_sd = dentry->d_fsdata; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - const struct sysfs_ops *ops = buffer->ops; - int rc; - - /* need attr_sd for attr and ops, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; - - rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count); - - sysfs_put_active(attr_sd); - - return rc; -} - - -/** - * sysfs_write_file - write an attribute. - * @file: file pointer - * @buf: data to write - * @count: number of bytes - * @ppos: starting offset - * - * Similar to sysfs_read_file(), though working in the opposite direction. - * We allocate and fill the data from the user in fill_write_buffer(), - * then push it to the kobject in flush_write_buffer(). - * There is no easy way for us to know if userspace is only doing a partial - * write, so we don't support them. We expect the entire buffer to come - * on the first write. - * Hint: if you're writing a value, first read the file, modify only the - * the value you're changing, then write entire buffer back. - */ -static ssize_t sysfs_write_file(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct sysfs_buffer *buffer = file->private_data; - ssize_t len; - - mutex_lock(&buffer->mutex); - len = fill_write_buffer(buffer, buf, count); - if (len > 0) - len = flush_write_buffer(file->f_path.dentry, buffer, len); - if (len > 0) - *ppos += len; - mutex_unlock(&buffer->mutex); - return len; + seq_commit(sf, count); + return 0; } -/** - * sysfs_get_open_dirent - get or create sysfs_open_dirent - * @sd: target sysfs_dirent - * @buffer: sysfs_buffer for this instance of open - * - * If @sd->s_attr.open exists, increment its reference count; - * otherwise, create one. @buffer is chained to the buffers - * list. - * - * LOCKING: - * Kernel thread context (may sleep). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int sysfs_get_open_dirent(struct sysfs_dirent *sd, - struct sysfs_buffer *buffer) +static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct sysfs_open_dirent *od, *new_od = NULL; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + loff_t size = file_inode(of->file)->i_size; - retry: - spin_lock_irq(&sysfs_open_dirent_lock); - - if (!sd->s_attr.open && new_od) { - sd->s_attr.open = new_od; - new_od = NULL; - } - - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->refcnt); - list_add_tail(&buffer->list, &od->buffers); - } - - spin_unlock_irq(&sysfs_open_dirent_lock); - - if (od) { - kfree(new_od); + if (!count) return 0; + + if (size) { + if (pos > size) + return 0; + if (pos + count > size) + count = size - pos; } - /* not there, initialize a new one and retry */ - new_od = kmalloc(sizeof(*new_od), GFP_KERNEL); - if (!new_od) - return -ENOMEM; + if (!battr->read) + return -EIO; - atomic_set(&new_od->refcnt, 0); - atomic_set(&new_od->event, 1); - init_waitqueue_head(&new_od->poll); - INIT_LIST_HEAD(&new_od->buffers); - goto retry; + return battr->read(of->file, kobj, battr, buf, pos, count); } -/** - * sysfs_put_open_dirent - put sysfs_open_dirent - * @sd: target sysfs_dirent - * @buffer: associated sysfs_buffer - * - * Put @sd->s_attr.open and unlink @buffer from the buffers list. - * If reference count reaches zero, disassociate and free it. - * - * LOCKING: - * None. - */ -static void sysfs_put_open_dirent(struct sysfs_dirent *sd, - struct sysfs_buffer *buffer) +/* kernfs write callback for regular sysfs files */ +static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct sysfs_open_dirent *od = sd->s_attr.open; - unsigned long flags; + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); + struct kobject *kobj = of->kn->parent->priv; - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); - - list_del(&buffer->list); - if (atomic_dec_and_test(&od->refcnt)) - sd->s_attr.open = NULL; - else - od = NULL; - - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); + if (!count) + return 0; - kfree(od); + return ops->store(kobj, of->kn->priv, buf, count); } -static int sysfs_open_file(struct inode *inode, struct file *file) +/* kernfs write callback for bin sysfs files */ +static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - struct sysfs_buffer *buffer; - const struct sysfs_ops *ops; - int error = -EACCES; - - /* need attr_sd for attr and ops, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; - - /* every kobject with an attribute needs a ktype assigned */ - if (kobj->ktype && kobj->ktype->sysfs_ops) - ops = kobj->ktype->sysfs_ops; - else { - WARN(1, KERN_ERR - "missing sysfs attribute operations for kobject: %s\n", - kobject_name(kobj)); - goto err_out; - } - - /* File needs write support. - * The inode's perms must say it's ok, - * and we must have a store method. - */ - if (file->f_mode & FMODE_WRITE) { - if (!(inode->i_mode & S_IWUGO) || !ops->store) - goto err_out; - } - - /* File needs read support. - * The inode's perms must say it's ok, and we there - * must be a show method for it. - */ - if (file->f_mode & FMODE_READ) { - if (!(inode->i_mode & S_IRUGO) || !ops->show) - goto err_out; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + loff_t size = file_inode(of->file)->i_size; + + if (size) { + if (size <= pos) + return 0; + count = min_t(ssize_t, count, size - pos); } + if (!count) + return 0; - /* No error? Great, allocate a buffer for the file, and store it - * it in file->private_data for easy access. - */ - error = -ENOMEM; - buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL); - if (!buffer) - goto err_out; - - mutex_init(&buffer->mutex); - buffer->needs_read_fill = 1; - buffer->ops = ops; - file->private_data = buffer; - - /* make sure we have open dirent struct */ - error = sysfs_get_open_dirent(attr_sd, buffer); - if (error) - goto err_free; - - /* open succeeded, put active references */ - sysfs_put_active(attr_sd); - return 0; + if (!battr->write) + return -EIO; - err_free: - kfree(buffer); - err_out: - sysfs_put_active(attr_sd); - return error; + return battr->write(of->file, kobj, battr, buf, pos, count); } -static int sysfs_release(struct inode *inode, struct file *filp) +static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, + struct vm_area_struct *vma) { - struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata; - struct sysfs_buffer *buffer = filp->private_data; - - sysfs_put_open_dirent(sd, buffer); - - if (buffer->page) - free_page((unsigned long)buffer->page); - kfree(buffer); + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; - return 0; + return battr->mmap(of->file, kobj, battr, vma); } -/* Sysfs attribute files are pollable. The idea is that you read - * the content and then you use 'poll' or 'select' to wait for - * the content to change. When the content changes (assuming the - * manager for the kobject supports notification), poll will - * return POLLERR|POLLPRI, and select will return the fd whether - * it is waiting for read, write, or exceptions. - * Once poll/select indicates that the value has changed, you - * need to close and re-open the file, or seek to 0 and read again. - * Reminder: this only works for attributes which actively support - * it, and it is not possible to test an attribute from userspace - * to see if it supports poll (Neither 'poll' nor 'select' return - * an appropriate error code). When in doubt, set a suitable timeout value. - */ -static unsigned int sysfs_poll(struct file *filp, poll_table *wait) +void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { - struct sysfs_buffer *buffer = filp->private_data; - struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; - struct sysfs_open_dirent *od = attr_sd->s_attr.open; - - /* need parent for the kobj, grab both */ - if (!sysfs_get_active(attr_sd)) - goto trigger; - - poll_wait(filp, &od->poll, wait); - - sysfs_put_active(attr_sd); + struct kernfs_node *kn = kobj->sd, *tmp; - if (buffer->event != atomic_read(&od->event)) - goto trigger; + if (kn && dir) + kn = kernfs_find_and_get(kn, dir); + else + kernfs_get(kn); - return DEFAULT_POLLMASK; + if (kn && attr) { + tmp = kernfs_find_and_get(kn, attr); + kernfs_put(kn); + kn = tmp; + } - trigger: - buffer->needs_read_fill = 1; - return DEFAULT_POLLMASK|POLLERR|POLLPRI; + if (kn) { + kernfs_notify(kn); + kernfs_put(kn); + } } +EXPORT_SYMBOL_GPL(sysfs_notify); -void sysfs_notify_dirent(struct sysfs_dirent *sd) -{ - struct sysfs_open_dirent *od; - unsigned long flags; - - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); +static const struct kernfs_ops sysfs_file_kfops_empty = { +}; - if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) { - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->event); - wake_up_interruptible(&od->poll); - } - } +static const struct kernfs_ops sysfs_file_kfops_ro = { + .seq_show = sysfs_kf_seq_show, +}; - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); -} -EXPORT_SYMBOL_GPL(sysfs_notify_dirent); +static const struct kernfs_ops sysfs_file_kfops_wo = { + .write = sysfs_kf_write, +}; -void sysfs_notify(struct kobject *k, const char *dir, const char *attr) -{ - struct sysfs_dirent *sd = k->sd; +static const struct kernfs_ops sysfs_file_kfops_rw = { + .seq_show = sysfs_kf_seq_show, + .write = sysfs_kf_write, +}; - mutex_lock(&sysfs_mutex); +static const struct kernfs_ops sysfs_bin_kfops_ro = { + .read = sysfs_kf_bin_read, +}; - if (sd && dir) - sd = sysfs_find_dirent(sd, NULL, dir); - if (sd && attr) - sd = sysfs_find_dirent(sd, NULL, attr); - if (sd) - sysfs_notify_dirent(sd); +static const struct kernfs_ops sysfs_bin_kfops_wo = { + .write = sysfs_kf_bin_write, +}; - mutex_unlock(&sysfs_mutex); -} -EXPORT_SYMBOL_GPL(sysfs_notify); +static const struct kernfs_ops sysfs_bin_kfops_rw = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, +}; -const struct file_operations sysfs_file_operations = { - .read = sysfs_read_file, - .write = sysfs_write_file, - .llseek = generic_file_llseek, - .open = sysfs_open_file, - .release = sysfs_release, - .poll = sysfs_poll, +static const struct kernfs_ops sysfs_bin_kfops_mmap = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, + .mmap = sysfs_kf_bin_mmap, }; -static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, - const void **pns) +int sysfs_add_file_mode_ns(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin, + umode_t mode, const void *ns) { - struct sysfs_dirent *dir_sd = kobj->sd; - const struct sysfs_ops *ops; - const void *ns = NULL; - int err; - - if (!dir_sd) { - WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n", - kobject_name(kobj)); - return -ENOENT; + struct lock_class_key *key = NULL; + const struct kernfs_ops *ops; + struct kernfs_node *kn; + loff_t size; + + if (!is_bin) { + struct kobject *kobj = parent->priv; + const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; + + /* every kobject with an attribute needs a ktype assigned */ + if (WARN(!sysfs_ops, KERN_ERR + "missing sysfs attribute operations for kobject: %s\n", + kobject_name(kobj))) + return -EINVAL; + + if (sysfs_ops->show && sysfs_ops->store) + ops = &sysfs_file_kfops_rw; + else if (sysfs_ops->show) + ops = &sysfs_file_kfops_ro; + else if (sysfs_ops->store) + ops = &sysfs_file_kfops_wo; + else + ops = &sysfs_file_kfops_empty; + + size = PAGE_SIZE; + } else { + struct bin_attribute *battr = (void *)attr; + + if (battr->mmap) + ops = &sysfs_bin_kfops_mmap; + else if (battr->read && battr->write) + ops = &sysfs_bin_kfops_rw; + else if (battr->read) + ops = &sysfs_bin_kfops_ro; + else if (battr->write) + ops = &sysfs_bin_kfops_wo; + else + ops = &sysfs_file_kfops_empty; + + size = battr->size; } - err = 0; - if (!sysfs_ns_type(dir_sd)) - goto out; - - err = -EINVAL; - if (!kobj->ktype) - goto out; - ops = kobj->ktype->sysfs_ops; - if (!ops) - goto out; - if (!ops->namespace) - goto out; - - err = 0; - ns = ops->namespace(kobj, attr); -out: - if (err) { - WARN(1, KERN_ERR - "missing sysfs namespace attribute operation for kobject: %s\n", - kobject_name(kobj)); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!attr->ignore_lockdep) + key = attr->key ?: (struct lock_class_key *)&attr->skey; +#endif + kn = __kernfs_create_file(parent, attr->name, mode, size, ops, + (void *)attr, ns, true, key); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, attr->name); + return PTR_ERR(kn); } - *pns = ns; - return err; -} - -int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, umode_t amode) -{ - umode_t mode = (amode & S_IALLUGO) | S_IFREG; - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - const void *ns; - int rc; - - rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns); - if (rc) - return rc; - - sd = sysfs_new_dirent(attr->name, mode, type); - if (!sd) - return -ENOMEM; - - sd->s_ns = ns; - sd->s_attr.attr = (void *)attr; - sysfs_dirent_init_lockdep(sd); - - sysfs_addrm_start(&acxt, dir_sd); - rc = sysfs_add_one(&acxt, sd); - sysfs_addrm_finish(&acxt); - - if (rc) - sysfs_put(sd); - - return rc; + return 0; } - -int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, - int type) +int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr, + bool is_bin) { - return sysfs_add_file_mode(dir_sd, attr, type, attr->mode); + return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL); } - /** - * sysfs_create_file - create an attribute file for an object. - * @kobj: object we're creating for. - * @attr: attribute descriptor. + * sysfs_create_file_ns - create an attribute file for an object with custom ns + * @kobj: object we're creating for + * @attr: attribute descriptor + * @ns: namespace the new file should belong to */ -int sysfs_create_file(struct kobject *kobj, const struct attribute *attr) +int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, + const void *ns) { BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR); + return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns); } -EXPORT_SYMBOL_GPL(sysfs_create_file); +EXPORT_SYMBOL_GPL(sysfs_create_file_ns); int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) { @@ -600,19 +308,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error; - if (group) - dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); - else - dir_sd = sysfs_get(kobj->sd); + if (group) { + parent = kernfs_find_and_get(kobj->sd, group); + } else { + parent = kobj->sd; + kernfs_get(parent); + } - if (!dir_sd) + if (!parent) return -ENOENT; - error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); - sysfs_put(dir_sd); + error = sysfs_add_file(parent, attr, false); + kernfs_put(parent); return error; } @@ -628,49 +338,63 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; struct iattr newattrs; - const void *ns; int rc; - rc = sysfs_attr_ns(kobj, attr, &ns); - if (rc) - return rc; - - mutex_lock(&sysfs_mutex); - - rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, ns, attr->name); - if (!sd) - goto out; + kn = kernfs_find_and_get(kobj->sd, attr->name); + if (!kn) + return -ENOENT; - newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); + newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE; - rc = sysfs_sd_setattr(sd, &newattrs); - out: - mutex_unlock(&sysfs_mutex); + rc = kernfs_setattr(kn, &newattrs); + + kernfs_put(kn); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); /** - * sysfs_remove_file - remove an object attribute. - * @kobj: object we're acting for. - * @attr: attribute descriptor. + * sysfs_remove_file_ns - remove an object attribute with a custom ns tag + * @kobj: object we're acting for + * @attr: attribute descriptor + * @ns: namespace tag of the file to remove + * + * Hash the attribute name and namespace tag and kill the victim. + */ +void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, + const void *ns) +{ + struct kernfs_node *parent = kobj->sd; + + kernfs_remove_by_name_ns(parent, attr->name, ns); +} +EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); + +/** + * sysfs_remove_file_self - remove an object attribute from its own method + * @kobj: object we're acting for + * @attr: attribute descriptor * - * Hash the attribute name and kill the victim. + * See kernfs_remove_self() for details. */ -void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) +bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { - const void *ns; + struct kernfs_node *parent = kobj->sd; + struct kernfs_node *kn; + bool ret; + + kn = kernfs_find_and_get(parent, attr->name); + if (WARN_ON_ONCE(!kn)) + return false; - if (sysfs_attr_ns(kobj, attr, &ns)) - return; + ret = kernfs_remove_self(kn); - sysfs_hash_and_remove(kobj->sd, ns, attr->name); + kernfs_put(kn); + return ret; } -EXPORT_SYMBOL_GPL(sysfs_remove_file); void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) { @@ -689,107 +413,44 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; - if (group) - dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); - else - dir_sd = sysfs_get(kobj->sd); - if (dir_sd) { - sysfs_hash_and_remove(dir_sd, NULL, attr->name); - sysfs_put(dir_sd); + if (group) { + parent = kernfs_find_and_get(kobj->sd, group); + } else { + parent = kobj->sd; + kernfs_get(parent); + } + + if (parent) { + kernfs_remove_by_name(parent, attr->name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); -struct sysfs_schedule_callback_struct { - struct list_head workq_list; - struct kobject *kobj; - void (*func)(void *); - void *data; - struct module *owner; - struct work_struct work; -}; - -static struct workqueue_struct *sysfs_workqueue; -static DEFINE_MUTEX(sysfs_workq_mutex); -static LIST_HEAD(sysfs_workq); -static void sysfs_schedule_callback_work(struct work_struct *work) +/** + * sysfs_create_bin_file - create binary file for object. + * @kobj: object. + * @attr: attribute descriptor. + */ +int sysfs_create_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { - struct sysfs_schedule_callback_struct *ss = container_of(work, - struct sysfs_schedule_callback_struct, work); - - (ss->func)(ss->data); - kobject_put(ss->kobj); - module_put(ss->owner); - mutex_lock(&sysfs_workq_mutex); - list_del(&ss->workq_list); - mutex_unlock(&sysfs_workq_mutex); - kfree(ss); + BUG_ON(!kobj || !kobj->sd || !attr); + + return sysfs_add_file(kobj->sd, &attr->attr, true); } +EXPORT_SYMBOL_GPL(sysfs_create_bin_file); /** - * sysfs_schedule_callback - helper to schedule a callback for a kobject - * @kobj: object we're acting for. - * @func: callback function to invoke later. - * @data: argument to pass to @func. - * @owner: module owning the callback code - * - * sysfs attribute methods must not unregister themselves or their parent - * kobject (which would amount to the same thing). Attempts to do so will - * deadlock, since unregistration is mutually exclusive with driver - * callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @data as its - * argument in the workqueue's process context. @kobj will be pinned - * until @func returns. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available, - * -EAGAIN if a callback has already been scheduled for @kobj. + * sysfs_remove_bin_file - remove binary file for object. + * @kobj: object. + * @attr: attribute descriptor. */ -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner) +void sysfs_remove_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { - struct sysfs_schedule_callback_struct *ss, *tmp; - - if (!try_module_get(owner)) - return -ENODEV; - - mutex_lock(&sysfs_workq_mutex); - list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) - if (ss->kobj == kobj) { - module_put(owner); - mutex_unlock(&sysfs_workq_mutex); - return -EAGAIN; - } - mutex_unlock(&sysfs_workq_mutex); - - if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_singlethread_workqueue("sysfsd"); - if (sysfs_workqueue == NULL) { - module_put(owner); - return -ENOMEM; - } - } - - ss = kmalloc(sizeof(*ss), GFP_KERNEL); - if (!ss) { - module_put(owner); - return -ENOMEM; - } - kobject_get(kobj); - ss->kobj = kobj; - ss->func = func; - ss->data = data; - ss->owner = owner; - INIT_WORK(&ss->work, sysfs_schedule_callback_work); - INIT_LIST_HEAD(&ss->workq_list); - mutex_lock(&sysfs_workq_mutex); - list_add_tail(&ss->workq_list, &sysfs_workq); - mutex_unlock(&sysfs_workq_mutex); - queue_work(sysfs_workqueue, &ss->work); - return 0; + kernfs_remove_by_name(kobj->sd, attr->attr.name); } -EXPORT_SYMBOL_GPL(sysfs_schedule_callback); +EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 5f92cd2f61c..7d2a860ba78 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -18,7 +18,7 @@ #include "sysfs.h" -static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, +static void remove_files(struct kernfs_node *parent, const struct attribute_group *grp) { struct attribute *const *attr; @@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, if (grp->attrs) for (attr = grp->attrs; *attr; attr++) - sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); + kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) - sysfs_remove_bin_file(kobj, *bin_attr); + kernfs_remove_by_name(parent, (*bin_attr)->attr.name); } -static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, +static int create_files(struct kernfs_node *parent, struct kobject *kobj, const struct attribute_group *grp, int update) { struct attribute *const *attr; @@ -49,21 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, * re-adding (if required) the file. */ if (update) - sysfs_hash_and_remove(dir_sd, NULL, - (*attr)->name); + kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (!mode) continue; } - error = sysfs_add_file_mode(dir_sd, *attr, - SYSFS_KOBJ_ATTR, - (*attr)->mode | mode); + error = sysfs_add_file_mode_ns(parent, *attr, false, + (*attr)->mode | mode, + NULL); if (unlikely(error)) break; } if (error) { - remove_files(dir_sd, kobj, grp); + remove_files(parent, grp); goto exit; } } @@ -71,13 +70,16 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, if (grp->bin_attrs) { for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { if (update) - sysfs_remove_bin_file(kobj, *bin_attr); - error = sysfs_create_bin_file(kobj, *bin_attr); + kernfs_remove_by_name(parent, + (*bin_attr)->attr.name); + error = sysfs_add_file_mode_ns(parent, + &(*bin_attr)->attr, true, + (*bin_attr)->attr.mode, NULL); if (error) break; } if (error) - remove_files(dir_sd, kobj, grp); + remove_files(parent, grp); } exit: return error; @@ -87,7 +89,7 @@ exit: static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; int error; BUG_ON(!kobj || (!update && !kobj->sd)); @@ -101,18 +103,22 @@ static int internal_create_group(struct kobject *kobj, int update, return -EINVAL; } if (grp->name) { - error = sysfs_create_subdir(kobj, grp->name, &sd); - if (error) - return error; + kn = kernfs_create_dir(kobj->sd, grp->name, + S_IRWXU | S_IRUGO | S_IXUGO, kobj); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(kobj->sd, grp->name); + return PTR_ERR(kn); + } } else - sd = kobj->sd; - sysfs_get(sd); - error = create_files(sd, kobj, grp, update); + kn = kobj->sd; + kernfs_get(kn); + error = create_files(kn, kobj, grp, update); if (error) { if (grp->name) - sysfs_remove_subdir(sd); + kernfs_remove(kn); } - sysfs_put(sd); + kernfs_put(kn); return error; } @@ -202,25 +208,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd = kobj->sd; - struct sysfs_dirent *sd; + struct kernfs_node *parent = kobj->sd; + struct kernfs_node *kn; if (grp->name) { - sd = sysfs_get_dirent(dir_sd, NULL, grp->name); - if (!sd) { - WARN(!sd, KERN_WARNING + kn = kernfs_find_and_get(parent, grp->name); + if (!kn) { + WARN(!kn, KERN_WARNING "sysfs group %p not found for kobject '%s'\n", grp, kobject_name(kobj)); return; } - } else - sd = sysfs_get(dir_sd); + } else { + kn = parent; + kernfs_get(kn); + } - remove_files(sd, kobj, grp); + remove_files(kn, grp); if (grp->name) - sysfs_remove_subdir(sd); + kernfs_remove(kn); - sysfs_put(sd); + kernfs_put(kn); } EXPORT_SYMBOL_GPL(sysfs_remove_group); @@ -256,22 +264,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error = 0; struct attribute *const *attr; int i; - dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); - if (!dir_sd) + parent = kernfs_find_and_get(kobj->sd, grp->name); + if (!parent) return -ENOENT; for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) - error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); + error = sysfs_add_file(parent, *attr, false); if (error) { while (--i >= 0) - sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name); + kernfs_remove_by_name(parent, (*--attr)->name); } - sysfs_put(dir_sd); + kernfs_put(parent); return error; } @@ -285,14 +293,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; struct attribute *const *attr; - dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); - if (dir_sd) { + parent = kernfs_find_and_get(kobj->sd, grp->name); + if (parent) { for (attr = grp->attrs; *attr; ++attr) - sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); - sysfs_put(dir_sd); + kernfs_remove_by_name(parent, (*attr)->name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); @@ -307,15 +315,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error = 0; - dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); - if (!dir_sd) + parent = kernfs_find_and_get(kobj->sd, group_name); + if (!parent) return -ENOENT; - error = sysfs_create_link_sd(dir_sd, target, link_name); - sysfs_put(dir_sd); + error = sysfs_create_link_sd(parent, target, link_name); + kernfs_put(parent); return error; } @@ -330,12 +338,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; - dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); - if (dir_sd) { - sysfs_hash_and_remove(dir_sd, NULL, link_name); - sysfs_put(dir_sd); + parent = kernfs_find_and_get(kobj->sd, group_name); + if (parent) { + kernfs_remove_by_name(parent, link_name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c deleted file mode 100644 index 963f910c803..00000000000 --- a/fs/sysfs/inode.c +++ /dev/null @@ -1,357 +0,0 @@ -/* - * fs/sysfs/inode.c - basic sysfs inode and dentry operations - * - * Copyright (c) 2001-3 Patrick Mochel - * Copyright (c) 2007 SUSE Linux Products GmbH - * Copyright (c) 2007 Tejun Heo <teheo@suse.de> - * - * This file is released under the GPLv2. - * - * Please see Documentation/filesystems/sysfs.txt for more information. - */ - -#undef DEBUG - -#include <linux/pagemap.h> -#include <linux/namei.h> -#include <linux/backing-dev.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/sysfs.h> -#include <linux/xattr.h> -#include <linux/security.h> -#include "sysfs.h" - -static const struct address_space_operations sysfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, -}; - -static struct backing_dev_info sysfs_backing_dev_info = { - .name = "sysfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static const struct inode_operations sysfs_inode_operations = { - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, -}; - -int __init sysfs_inode_init(void) -{ - return bdi_init(&sysfs_backing_dev_info); -} - -static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) -{ - struct sysfs_inode_attrs *attrs; - struct iattr *iattrs; - - attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); - if (!attrs) - return NULL; - iattrs = &attrs->ia_iattr; - - /* assign default attributes */ - iattrs->ia_mode = sd->s_mode; - iattrs->ia_uid = GLOBAL_ROOT_UID; - iattrs->ia_gid = GLOBAL_ROOT_GID; - iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; - - return attrs; -} - -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr) -{ - struct sysfs_inode_attrs *sd_attrs; - struct iattr *iattrs; - unsigned int ia_valid = iattr->ia_valid; - - sd_attrs = sd->s_iattr; - - if (!sd_attrs) { - /* setting attributes for the first time, allocate now */ - sd_attrs = sysfs_init_inode_attrs(sd); - if (!sd_attrs) - return -ENOMEM; - sd->s_iattr = sd_attrs; - } - /* attributes were changed at least once in past */ - iattrs = &sd_attrs->ia_iattr; - - if (ia_valid & ATTR_UID) - iattrs->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - iattrs->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - iattrs->ia_atime = iattr->ia_atime; - if (ia_valid & ATTR_MTIME) - iattrs->ia_mtime = iattr->ia_mtime; - if (ia_valid & ATTR_CTIME) - iattrs->ia_ctime = iattr->ia_ctime; - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - iattrs->ia_mode = sd->s_mode = mode; - } - return 0; -} - -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - struct sysfs_dirent *sd = dentry->d_fsdata; - int error; - - if (!sd) - return -EINVAL; - - mutex_lock(&sysfs_mutex); - error = inode_change_ok(inode, iattr); - if (error) - goto out; - - error = sysfs_sd_setattr(sd, iattr); - if (error) - goto out; - - /* this ignores size changes */ - setattr_copy(inode, iattr); - -out: - mutex_unlock(&sysfs_mutex); - return error; -} - -static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, - u32 *secdata_len) -{ - struct sysfs_inode_attrs *iattrs; - void *old_secdata; - size_t old_secdata_len; - - if (!sd->s_iattr) { - sd->s_iattr = sysfs_init_inode_attrs(sd); - if (!sd->s_iattr) - return -ENOMEM; - } - - iattrs = sd->s_iattr; - old_secdata = iattrs->ia_secdata; - old_secdata_len = iattrs->ia_secdata_len; - - iattrs->ia_secdata = *secdata; - iattrs->ia_secdata_len = *secdata_len; - - *secdata = old_secdata; - *secdata_len = old_secdata_len; - return 0; -} - -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - void *secdata; - int error; - u32 secdata_len = 0; - - if (!sd) - return -EINVAL; - - if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { - const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(dentry->d_inode, suffix, - value, size, flags); - if (error) - goto out; - error = security_inode_getsecctx(dentry->d_inode, - &secdata, &secdata_len); - if (error) - goto out; - - mutex_lock(&sysfs_mutex); - error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); - mutex_unlock(&sysfs_mutex); - - if (secdata) - security_release_secctx(secdata, secdata_len); - } else - return -EINVAL; -out: - return error; -} - -static inline void set_default_inode_attr(struct inode *inode, umode_t mode) -{ - inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; -} - -static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) -{ - inode->i_uid = iattr->ia_uid; - inode->i_gid = iattr->ia_gid; - inode->i_atime = iattr->ia_atime; - inode->i_mtime = iattr->ia_mtime; - inode->i_ctime = iattr->ia_ctime; -} - -static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) -{ - struct sysfs_inode_attrs *iattrs = sd->s_iattr; - - inode->i_mode = sd->s_mode; - if (iattrs) { - /* sysfs_dirent has non-default attributes - * get them from persistent copy in sysfs_dirent - */ - set_inode_attr(inode, &iattrs->ia_iattr); - security_inode_notifysecctx(inode, - iattrs->ia_secdata, - iattrs->ia_secdata_len); - } - - if (sysfs_type(sd) == SYSFS_DIR) - set_nlink(inode, sd->s_dir.subdirs + 2); -} - -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - struct inode *inode = dentry->d_inode; - - mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); - mutex_unlock(&sysfs_mutex); - - generic_fillattr(inode, stat); - return 0; -} - -static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) -{ - struct bin_attribute *bin_attr; - - inode->i_private = sysfs_get(sd); - inode->i_mapping->a_ops = &sysfs_aops; - inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; - inode->i_op = &sysfs_inode_operations; - - set_default_inode_attr(inode, sd->s_mode); - sysfs_refresh_inode(sd, inode); - - /* initialize inode according to type */ - switch (sysfs_type(sd)) { - case SYSFS_DIR: - inode->i_op = &sysfs_dir_inode_operations; - inode->i_fop = &sysfs_dir_operations; - break; - case SYSFS_KOBJ_ATTR: - inode->i_size = PAGE_SIZE; - inode->i_fop = &sysfs_file_operations; - break; - case SYSFS_KOBJ_BIN_ATTR: - bin_attr = sd->s_bin_attr.bin_attr; - inode->i_size = bin_attr->size; - inode->i_fop = &bin_fops; - break; - case SYSFS_KOBJ_LINK: - inode->i_op = &sysfs_symlink_inode_operations; - break; - default: - BUG(); - } - - unlock_new_inode(inode); -} - -/** - * sysfs_get_inode - get inode for sysfs_dirent - * @sb: super block - * @sd: sysfs_dirent to allocate inode for - * - * Get inode for @sd. If such inode doesn't exist, a new inode - * is allocated and basics are initialized. New inode is - * returned locked. - * - * LOCKING: - * Kernel thread context (may sleep). - * - * RETURNS: - * Pointer to allocated inode on success, NULL on failure. - */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) -{ - struct inode *inode; - - inode = iget_locked(sb, sd->s_ino); - if (inode && (inode->i_state & I_NEW)) - sysfs_init_inode(sd, inode); - - return inode; -} - -/* - * The sysfs_dirent serves as both an inode and a directory entry for sysfs. - * To prevent the sysfs inode numbers from being freed prematurely we take a - * reference to sysfs_dirent from the sysfs inode. A - * super_operations.evict_inode() implementation is needed to drop that - * reference upon inode destruction. - */ -void sysfs_evict_inode(struct inode *inode) -{ - struct sysfs_dirent *sd = inode->i_private; - - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); - sysfs_put(sd); -} - -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, - const char *name) -{ - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - - if (!dir_sd) { - WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", - name); - return -ENOENT; - } - - sysfs_addrm_start(&acxt, dir_sd); - - sd = sysfs_find_dirent(dir_sd, ns, name); - if (sd) - sysfs_remove_one(&acxt, sd); - - sysfs_addrm_finish(&acxt); - - if (sd) - return 0; - else - return -ENOENT; -} - -int sysfs_permission(struct inode *inode, int mask) -{ - struct sysfs_dirent *sd; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - sd = inode->i_private; - - mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); - mutex_unlock(&sysfs_mutex); - - return generic_permission(inode, mask); -} diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 834ec2cdb7a..8a49486bf30 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -13,147 +13,45 @@ #define DEBUG #include <linux/fs.h> +#include <linux/magic.h> #include <linux/mount.h> -#include <linux/pagemap.h> #include <linux/init.h> -#include <linux/module.h> -#include <linux/magic.h> -#include <linux/slab.h> #include <linux/user_namespace.h> #include "sysfs.h" - -static struct vfsmount *sysfs_mnt; -struct kmem_cache *sysfs_dir_cachep; - -static const struct super_operations sysfs_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .evict_inode = sysfs_evict_inode, -}; - -struct sysfs_dirent sysfs_root = { - .s_name = "", - .s_count = ATOMIC_INIT(1), - .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), - .s_mode = S_IFDIR | S_IRUGO | S_IXUGO, - .s_ino = 1, -}; - -static int sysfs_fill_super(struct super_block *sb, void *data, int silent) -{ - struct inode *inode; - struct dentry *root; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; - sb->s_op = &sysfs_ops; - sb->s_time_gran = 1; - - /* get root inode, initialize and unlock it */ - mutex_lock(&sysfs_mutex); - inode = sysfs_get_inode(sb, &sysfs_root); - mutex_unlock(&sysfs_mutex); - if (!inode) { - pr_debug("sysfs: could not get root inode\n"); - return -ENOMEM; - } - - /* instantiate and link root dentry */ - root = d_make_root(inode); - if (!root) { - pr_debug("%s: could not get root dentry!\n", __func__); - return -ENOMEM; - } - root->d_fsdata = &sysfs_root; - sb->s_root = root; - sb->s_d_op = &sysfs_dentry_ops; - return 0; -} - -static int sysfs_test_super(struct super_block *sb, void *data) -{ - struct sysfs_super_info *sb_info = sysfs_info(sb); - struct sysfs_super_info *info = data; - enum kobj_ns_type type; - int found = 1; - - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (sb_info->ns[type] != info->ns[type]) - found = 0; - } - return found; -} - -static int sysfs_set_super(struct super_block *sb, void *data) -{ - int error; - error = set_anon_super(sb, data); - if (!error) - sb->s_fs_info = data; - return error; -} - -static void free_sysfs_super_info(struct sysfs_super_info *info) -{ - int type; - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - kobj_ns_drop(type, info->ns[type]); - kfree(info); -} +static struct kernfs_root *sysfs_root; +struct kernfs_node *sysfs_root_kn; static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct sysfs_super_info *info; - enum kobj_ns_type type; - struct super_block *sb; - int error; + struct dentry *root; + void *ns; + bool new_sb; if (!(flags & MS_KERNMOUNT)) { if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) return ERR_PTR(-EPERM); - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (!kobj_ns_current_may_mount(type)) - return ERR_PTR(-EPERM); - } - } - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return ERR_PTR(-ENOMEM); - - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - info->ns[type] = kobj_ns_grab_current(type); - - sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); - if (IS_ERR(sb) || sb->s_fs_info != info) - free_sysfs_super_info(info); - if (IS_ERR(sb)) - return ERR_CAST(sb); - if (!sb->s_root) { - error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(sb); - return ERR_PTR(error); - } - sb->s_flags |= MS_ACTIVE; + if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) + return ERR_PTR(-EPERM); } - return dget(sb->s_root); + ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, + SYSFS_MAGIC, &new_sb, ns); + if (IS_ERR(root) || !new_sb) + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); + return root; } static void sysfs_kill_sb(struct super_block *sb) { - struct sysfs_super_info *info = sysfs_info(sb); - /* Remove the superblock from fs_supers/s_instances - * so we can't find it, before freeing sysfs_super_info. - */ - kill_anon_super(sb); - free_sysfs_super_info(info); + void *ns = (void *)kernfs_super_ns(sb); + + kernfs_kill_sb(sb); + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); } static struct file_system_type sysfs_fs_type = { @@ -165,48 +63,20 @@ static struct file_system_type sysfs_fs_type = { int __init sysfs_init(void) { - int err = -ENOMEM; + int err; - sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", - sizeof(struct sysfs_dirent), - 0, 0, NULL); - if (!sysfs_dir_cachep) - goto out; + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + NULL); + if (IS_ERR(sysfs_root)) + return PTR_ERR(sysfs_root); - err = sysfs_inode_init(); - if (err) - goto out_err; + sysfs_root_kn = sysfs_root->kn; err = register_filesystem(&sysfs_fs_type); - if (!err) { - sysfs_mnt = kern_mount(&sysfs_fs_type); - if (IS_ERR(sysfs_mnt)) { - printk(KERN_ERR "sysfs: could not mount!\n"); - err = PTR_ERR(sysfs_mnt); - sysfs_mnt = NULL; - unregister_filesystem(&sysfs_fs_type); - goto out_err; - } - } else - goto out_err; -out: - return err; -out_err: - kmem_cache_destroy(sysfs_dir_cachep); - sysfs_dir_cachep = NULL; - goto out; -} - -#undef sysfs_get -struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) -{ - return __sysfs_get(sd); -} -EXPORT_SYMBOL_GPL(sysfs_get); + if (err) { + kernfs_destroy_root(sysfs_root); + return err; + } -#undef sysfs_put -void sysfs_put(struct sysfs_dirent *sd) -{ - __sysfs_put(sd); + return 0; } -EXPORT_SYMBOL_GPL(sysfs_put); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 2dd4507d9ed..aecb15f8455 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -11,107 +11,73 @@ */ #include <linux/fs.h> -#include <linux/gfp.h> -#include <linux/mount.h> #include <linux/module.h> #include <linux/kobject.h> -#include <linux/namei.h> #include <linux/mutex.h> #include <linux/security.h> #include "sysfs.h" -static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, - struct kobject *target, +static int sysfs_do_create_link_sd(struct kernfs_node *parent, + struct kobject *target_kobj, const char *name, int warn) { - struct sysfs_dirent *target_sd = NULL; - struct sysfs_dirent *sd = NULL; - struct sysfs_addrm_cxt acxt; - enum kobj_ns_type ns_type; - int error; + struct kernfs_node *kn, *target = NULL; - BUG_ON(!name || !parent_sd); + BUG_ON(!name || !parent); - /* target->sd can go away beneath us but is protected with - * sysfs_assoc_lock. Fetch target_sd from it. + /* + * We don't own @target_kobj and it may be removed at any time. + * Synchronize using sysfs_symlink_target_lock. See + * sysfs_remove_dir() for details. */ - spin_lock(&sysfs_assoc_lock); - if (target->sd) - target_sd = sysfs_get(target->sd); - spin_unlock(&sysfs_assoc_lock); - - error = -ENOENT; - if (!target_sd) - goto out_put; - - error = -ENOMEM; - sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); - if (!sd) - goto out_put; - - ns_type = sysfs_ns_type(parent_sd); - if (ns_type) - sd->s_ns = target->ktype->namespace(target); - sd->s_symlink.target_sd = target_sd; - target_sd = NULL; /* reference is now owned by the symlink */ - - sysfs_addrm_start(&acxt, parent_sd); - /* Symlinks must be between directories with the same ns_type */ - if (!ns_type || - (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { - if (warn) - error = sysfs_add_one(&acxt, sd); - else - error = __sysfs_add_one(&acxt, sd); - } else { - error = -EINVAL; - WARN(1, KERN_WARNING - "sysfs: symlink across ns_types %s/%s -> %s/%s\n", - parent_sd->s_name, - sd->s_name, - sd->s_symlink.target_sd->s_parent->s_name, - sd->s_symlink.target_sd->s_name); + spin_lock(&sysfs_symlink_target_lock); + if (target_kobj->sd) { + target = target_kobj->sd; + kernfs_get(target); } - sysfs_addrm_finish(&acxt); + spin_unlock(&sysfs_symlink_target_lock); + + if (!target) + return -ENOENT; - if (error) - goto out_put; + kn = kernfs_create_link(parent, name, target); + kernfs_put(target); - return 0; + if (!IS_ERR(kn)) + return 0; - out_put: - sysfs_put(target_sd); - sysfs_put(sd); - return error; + if (warn && PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, name); + return PTR_ERR(kn); } /** * sysfs_create_link_sd - create symlink to a given object. - * @sd: directory we're creating the link in. + * @kn: directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ -int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name) { - return sysfs_do_create_link_sd(sd, target, name, 1); + return sysfs_do_create_link_sd(kn, target, name, 1); } static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, const char *name, int warn) { - struct sysfs_dirent *parent_sd = NULL; + struct kernfs_node *parent = NULL; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; - if (!parent_sd) + if (!parent) return -EFAULT; - return sysfs_do_create_link_sd(parent_sd, target, name, warn); + return sysfs_do_create_link_sd(parent, target, name, warn); } /** @@ -155,11 +121,17 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, const char *name) { const void *ns = NULL; - spin_lock(&sysfs_assoc_lock); - if (targ->sd && sysfs_ns_type(kobj->sd)) - ns = targ->sd->s_ns; - spin_unlock(&sysfs_assoc_lock); - sysfs_hash_and_remove(kobj->sd, ns, name); + + /* + * We don't own @target and it may be removed at any time. + * Synchronize using sysfs_symlink_target_lock. See + * sysfs_remove_dir() for details. + */ + spin_lock(&sysfs_symlink_target_lock); + if (targ->sd && kernfs_ns_enabled(kobj->sd)) + ns = targ->sd->ns; + spin_unlock(&sysfs_symlink_target_lock); + kernfs_remove_by_name_ns(kobj->sd, name, ns); } /** @@ -169,156 +141,57 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, */ void sysfs_remove_link(struct kobject *kobj, const char *name) { - struct sysfs_dirent *parent_sd = NULL; + struct kernfs_node *parent = NULL; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; - sysfs_hash_and_remove(parent_sd, NULL, name); + kernfs_remove_by_name(parent, name); } EXPORT_SYMBOL_GPL(sysfs_remove_link); /** - * sysfs_rename_link - rename symlink in object's directory. + * sysfs_rename_link_ns - rename symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to. * @old: previous name of the symlink. * @new: new name of the symlink. + * @new_ns: new namespace of the symlink. * * A helper function for the common rename symlink idiom. */ -int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, - const char *old, const char *new) +int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, + const char *old, const char *new, const void *new_ns) { - struct sysfs_dirent *parent_sd, *sd = NULL; - const void *old_ns = NULL, *new_ns = NULL; + struct kernfs_node *parent, *kn = NULL; + const void *old_ns = NULL; int result; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; if (targ->sd) - old_ns = targ->sd->s_ns; + old_ns = targ->sd->ns; result = -ENOENT; - sd = sysfs_get_dirent(parent_sd, old_ns, old); - if (!sd) + kn = kernfs_find_and_get_ns(parent, old, old_ns); + if (!kn) goto out; result = -EINVAL; - if (sysfs_type(sd) != SYSFS_KOBJ_LINK) + if (kernfs_type(kn) != KERNFS_LINK) goto out; - if (sd->s_symlink.target_sd->s_dir.kobj != targ) + if (kn->symlink.target_kn->priv != targ) goto out; - if (sysfs_ns_type(parent_sd)) - new_ns = targ->ktype->namespace(targ); - - result = sysfs_rename(sd, parent_sd, new_ns, new); + result = kernfs_rename_ns(kn, parent, new, new_ns); out: - sysfs_put(sd); + kernfs_put(kn); return result; } -EXPORT_SYMBOL_GPL(sysfs_rename_link); - -static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, - struct sysfs_dirent *target_sd, char *path) -{ - struct sysfs_dirent *base, *sd; - char *s = path; - int len = 0; - - /* go up to the root, stop at the base */ - base = parent_sd; - while (base->s_parent) { - sd = target_sd->s_parent; - while (sd->s_parent && base != sd) - sd = sd->s_parent; - - if (base == sd) - break; - - strcpy(s, "../"); - s += 3; - base = base->s_parent; - } - - /* determine end of target string for reverse fillup */ - sd = target_sd; - while (sd->s_parent && sd != base) { - len += strlen(sd->s_name) + 1; - sd = sd->s_parent; - } - - /* check limits */ - if (len < 2) - return -EINVAL; - len--; - if ((s - path) + len > PATH_MAX) - return -ENAMETOOLONG; - - /* reverse fillup of target string from target to base */ - sd = target_sd; - while (sd->s_parent && sd != base) { - int slen = strlen(sd->s_name); - - len -= slen; - strncpy(s + len, sd->s_name, slen); - if (len) - s[--len] = '/'; - - sd = sd->s_parent; - } - - return 0; -} - -static int sysfs_getlink(struct dentry *dentry, char *path) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - struct sysfs_dirent *parent_sd = sd->s_parent; - struct sysfs_dirent *target_sd = sd->s_symlink.target_sd; - int error; - - mutex_lock(&sysfs_mutex); - error = sysfs_get_target_path(parent_sd, target_sd, path); - mutex_unlock(&sysfs_mutex); - - return error; -} - -static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - int error = -ENOMEM; - unsigned long page = get_zeroed_page(GFP_KERNEL); - if (page) { - error = sysfs_getlink(dentry, (char *) page); - if (error < 0) - free_page((unsigned long)page); - } - nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); - return NULL; -} - -static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *page = nd_get_link(nd); - if (!IS_ERR(page)) - free_page((unsigned long)page); -} - -const struct inode_operations sysfs_symlink_inode_operations = { - .setxattr = sysfs_setxattr, - .readlink = generic_readlink, - .follow_link = sysfs_follow_link, - .put_link = sysfs_put_link, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .permission = sysfs_permission, -}; +EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index b6deca3e301..0e2f1cccb81 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,239 +8,36 @@ * This file is released under the GPLv2. */ -#include <linux/lockdep.h> -#include <linux/kobject_ns.h> -#include <linux/fs.h> -#include <linux/rbtree.h> +#ifndef __SYSFS_INTERNAL_H +#define __SYSFS_INTERNAL_H -struct sysfs_open_dirent; - -/* type-specific structures for sysfs_dirent->s_* union members */ -struct sysfs_elem_dir { - struct kobject *kobj; - - unsigned long subdirs; - /* children rbtree starts here and goes through sd->s_rb */ - struct rb_root children; -}; - -struct sysfs_elem_symlink { - struct sysfs_dirent *target_sd; -}; - -struct sysfs_elem_attr { - struct attribute *attr; - struct sysfs_open_dirent *open; -}; - -struct sysfs_elem_bin_attr { - struct bin_attribute *bin_attr; - struct hlist_head buffers; -}; - -struct sysfs_inode_attrs { - struct iattr ia_iattr; - void *ia_secdata; - u32 ia_secdata_len; -}; - -/* - * sysfs_dirent - the building block of sysfs hierarchy. Each and - * every sysfs node is represented by single sysfs_dirent. - * - * As long as s_count reference is held, the sysfs_dirent itself is - * accessible. Dereferencing s_elem or any other outer entity - * requires s_active reference. - */ -struct sysfs_dirent { - atomic_t s_count; - atomic_t s_active; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif - struct sysfs_dirent *s_parent; - const char *s_name; - - struct rb_node s_rb; - - union { - struct completion *completion; - struct sysfs_dirent *removed_list; - } u; - - const void *s_ns; /* namespace tag */ - unsigned int s_hash; /* ns + name hash */ - union { - struct sysfs_elem_dir s_dir; - struct sysfs_elem_symlink s_symlink; - struct sysfs_elem_attr s_attr; - struct sysfs_elem_bin_attr s_bin_attr; - }; - - unsigned short s_flags; - umode_t s_mode; - unsigned int s_ino; - struct sysfs_inode_attrs *s_iattr; -}; - -#define SD_DEACTIVATED_BIAS INT_MIN - -#define SYSFS_TYPE_MASK 0x00ff -#define SYSFS_DIR 0x0001 -#define SYSFS_KOBJ_ATTR 0x0002 -#define SYSFS_KOBJ_BIN_ATTR 0x0004 -#define SYSFS_KOBJ_LINK 0x0008 -#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) -#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) - -/* identify any namespace tag on sysfs_dirents */ -#define SYSFS_NS_TYPE_MASK 0xf00 -#define SYSFS_NS_TYPE_SHIFT 8 - -#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) -#define SYSFS_FLAG_REMOVED 0x02000 - -static inline unsigned int sysfs_type(struct sysfs_dirent *sd) -{ - return sd->s_flags & SYSFS_TYPE_MASK; -} - -/* - * Return any namespace tags on this dirent. - * enum kobj_ns_type is defined in linux/kobject.h - */ -static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) -{ - return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -#define sysfs_dirent_init_lockdep(sd) \ -do { \ - struct attribute *attr = sd->s_attr.attr; \ - struct lock_class_key *key = attr->key; \ - if (!key) \ - key = &attr->skey; \ - \ - lockdep_init_map(&sd->dep_map, "s_active", key, 0); \ -} while (0) -#else -#define sysfs_dirent_init_lockdep(sd) do {} while (0) -#endif - -/* - * Context structure to be used while adding/removing nodes. - */ -struct sysfs_addrm_cxt { - struct sysfs_dirent *parent_sd; - struct sysfs_dirent *removed; -}; +#include <linux/sysfs.h> /* * mount.c */ - -/* - * Each sb is associated with a set of namespace tags (i.e. - * the network namespace of the task which mounted this sysfs - * instance). - */ -struct sysfs_super_info { - void *ns[KOBJ_NS_TYPES]; -}; -#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) -extern struct sysfs_dirent sysfs_root; -extern struct kmem_cache *sysfs_dir_cachep; +extern struct kernfs_node *sysfs_root_kn; /* * dir.c */ -extern struct mutex sysfs_mutex; -extern spinlock_t sysfs_assoc_lock; -extern const struct dentry_operations sysfs_dentry_ops; - -extern const struct file_operations sysfs_dir_operations; -extern const struct inode_operations sysfs_dir_inode_operations; - -struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); -void sysfs_put_active(struct sysfs_dirent *sd); -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *parent_sd); -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); -void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); - -struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const void *ns, - const unsigned char *name); -struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, - const void *ns, - const unsigned char *name); -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); - -void release_sysfs_dirent(struct sysfs_dirent *sd); +extern spinlock_t sysfs_symlink_target_lock; -int sysfs_create_subdir(struct kobject *kobj, const char *name, - struct sysfs_dirent **p_sd); -void sysfs_remove_subdir(struct sysfs_dirent *sd); - -int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, - const void *ns, const char *new_name); - -static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) -{ - if (sd) { - WARN_ON(!atomic_read(&sd->s_count)); - atomic_inc(&sd->s_count); - } - return sd; -} -#define sysfs_get(sd) __sysfs_get(sd) - -static inline void __sysfs_put(struct sysfs_dirent *sd) -{ - if (sd && atomic_dec_and_test(&sd->s_count)) - release_sysfs_dirent(sd); -} -#define sysfs_put(sd) __sysfs_put(sd) - -/* - * inode.c - */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); -void sysfs_evict_inode(struct inode *inode); -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); -int sysfs_permission(struct inode *inode, int mask); -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, - const char *name); -int sysfs_inode_init(void); +void sysfs_warn_dup(struct kernfs_node *parent, const char *name); /* * file.c */ -extern const struct file_operations sysfs_file_operations; - -int sysfs_add_file(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type); - -int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, umode_t amode); -/* - * bin.c - */ -extern const struct file_operations bin_fops; -void unmap_bin_file(struct sysfs_dirent *attr_sd); +int sysfs_add_file(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin); +int sysfs_add_file_mode_ns(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin, + umode_t amode, const void *ns); /* * symlink.c */ -extern const struct inode_operations sysfs_symlink_inode_operations; -int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name); + +#endif /* __SYSFS_INTERNAL_H */ diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 9d4dc683179..b00811c75b2 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -21,10 +21,10 @@ */ const struct file_operations sysv_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index c327d4ee123..88956309cc8 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -60,6 +60,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data) { struct sysv_sb_info *sbi = SYSV_SB(sb); + sync_filesystem(sb); if (sbi->s_forced_ro) *flags |= MS_RDONLY; return 0; @@ -295,7 +296,7 @@ int sysv_sync_inode(struct inode *inode) static void sysv_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; sysv_truncate(inode); diff --git a/fs/timerfd.c b/fs/timerfd.c index 929312180dd..0013142c047 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -317,6 +317,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && clockid != CLOCK_REALTIME_ALARM && + clockid != CLOCK_BOOTTIME && clockid != CLOCK_BOOTTIME_ALARM)) return -EINVAL; diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index e8e01d74dc0..eb997e9c4ab 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -437,7 +437,6 @@ static int calc_dd_growth(const struct ubifs_info *c, */ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) { - int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); int err, idx_growth, data_growth, dd_growth, retried = 0; ubifs_assert(req->new_page <= 1); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 6e025e02ffd..177b0152fef 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -745,8 +745,10 @@ void ubifs_dump_lprops(struct ubifs_info *c) for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { err = ubifs_read_one_lp(c, lnum, &lp); - if (err) + if (err) { ubifs_err("cannot read lprops for LEB %d", lnum); + continue; + } ubifs_dump_lprop(c, &lp); } @@ -2118,26 +2120,10 @@ out_free: */ static void free_inodes(struct fsck_data *fsckd) { - struct rb_node *this = fsckd->inodes.rb_node; - struct fsck_inode *fscki; + struct fsck_inode *fscki, *n; - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - fscki = rb_entry(this, struct fsck_inode, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &fscki->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - kfree(fscki); - } - } + rbtree_postorder_for_each_entry_safe(fscki, n, &fsckd->inodes, rb) + kfree(fscki); } /** @@ -2563,9 +2549,9 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - from = prandom_u32() % (len + 1); - /* Corruption may only span one max. write unit */ - to = min(len, ALIGN(from, c->max_write_size)); + from = prandom_u32() % len; + /* Corruption span max to end of write unit */ + to = min(len, ALIGN(from + 1, c->max_write_size)); ubifs_warn("filled bytes %u-%u with %s", from, to - 1, ffs ? "0xFFs" : "random data"); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 123c79b7261..b5b593c4527 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -903,8 +903,9 @@ static int do_writepage(struct page *page, int len) struct ubifs_info *c = inode->i_sb->s_fs_info; #ifdef UBIFS_DEBUG + struct ubifs_inode *ui = ubifs_inode(inode); spin_lock(&ui->ui_lock); - ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE); + ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT); spin_unlock(&ui->ui_lock); #endif @@ -1363,17 +1364,17 @@ static inline int mctime_update_needed(const struct inode *inode, /** * update_ctime - update mtime and ctime of an inode. - * @c: UBIFS file-system description object * @inode: inode to update * * This function updates mtime and ctime of the inode if it is not equivalent to * current time. Returns zero in case of success and a negative error code in * case of failure. */ -static int update_mctime(struct ubifs_info *c, struct inode *inode) +static int update_mctime(struct inode *inode) { struct timespec now = ubifs_current_time(inode); struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; if (mctime_update_needed(inode, &now)) { int err, release; @@ -1396,18 +1397,13 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode) return 0; } -static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int err; - struct inode *inode = iocb->ki_filp->f_mapping->host; - struct ubifs_info *c = inode->i_sb->s_fs_info; - - err = update_mctime(c, inode); + int err = update_mctime(file_inode(iocb->ki_filp)); if (err) return err; - return generic_file_aio_write(iocb, iov, nr_segs, pos); + return generic_file_write_iter(iocb, from); } static int ubifs_set_page_dirty(struct page *page) @@ -1525,8 +1521,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, } wait_for_stable_page(page); - unlock_page(page); - return 0; + return VM_FAULT_LOCKED; out_unlock: unlock_page(page); @@ -1538,6 +1533,7 @@ out_unlock: static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -1581,15 +1577,15 @@ const struct inode_operations ubifs_symlink_inode_operations = { const struct file_operations ubifs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = ubifs_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ubifs_write_iter, .mmap = ubifs_file_mmap, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 76ca53cd3ee..9718da86ad0 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -668,8 +668,7 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) ubifs_assert(!wbuf->used); for (i = 0; ; i++) { - int space_before = c->leb_size - wbuf->offs - wbuf->used; - int space_after; + int space_before, space_after; cond_resched(); diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index e18b9889a51..2290d586672 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -988,30 +988,32 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, return err; if (type != ch->node_type) { - ubifs_err("bad node type (%d but expected %d)", - ch->node_type, type); + ubifs_errc(c, "bad node type (%d but expected %d)", + ch->node_type, type); goto out; } err = ubifs_check_node(c, buf, lnum, offs, 0, 0); if (err) { - ubifs_err("expected node type %d", type); + ubifs_errc(c, "expected node type %d", type); return err; } l = le32_to_cpu(ch->len); if (l != len) { - ubifs_err("bad node length %d, expected %d", l, len); + ubifs_errc(c, "bad node length %d, expected %d", l, len); goto out; } return 0; out: - ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs, - ubi_is_mapped(c->ubi, lnum)); - ubifs_dump_node(c, buf); - dump_stack(); + ubifs_errc(c, "bad node at LEB %d:%d, LEB mapping status %d", lnum, + offs, ubi_is_mapped(c->ubi, lnum)); + if (!c->probing) { + ubifs_dump_node(c, buf); + dump_stack(); + } return -EINVAL; } diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 36bd4efd081..a902c5919e4 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -574,27 +574,10 @@ static int done_already(struct rb_root *done_tree, int lnum) */ static void destroy_done_tree(struct rb_root *done_tree) { - struct rb_node *this = done_tree->rb_node; - struct done_ref *dr; + struct done_ref *dr, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - dr = rb_entry(this, struct done_ref, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &dr->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(dr, n, done_tree, rb) kfree(dr); - } } /** diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 4b826abb152..45d4e96a6ba 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -460,9 +460,9 @@ static int write_cnodes(struct ubifs_info *c) * important. */ clear_bit(DIRTY_CNODE, &cnode->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(COW_CNODE, &cnode->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); offs += len; dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index ba32da3fe08..f1c3e5a1b31 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -815,27 +815,10 @@ static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) static void dbg_free_check_tree(struct rb_root *root) { - struct rb_node *this = root->rb_node; - struct check_orphan *o; + struct check_orphan *o, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - o = rb_entry(this, struct check_orphan, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &o->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(o, n, root, rb) kfree(o); - } } static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 065096e36ed..c14adb2f420 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -1335,29 +1335,14 @@ static void remove_ino(struct ubifs_info *c, ino_t inum) */ void ubifs_destroy_size_tree(struct ubifs_info *c) { - struct rb_node *this = c->size_tree.rb_node; - struct size_entry *e; + struct size_entry *e, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - e = rb_entry(this, struct size_entry, rb); + rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) { if (e->inode) iput(e->inode); - this = rb_parent(this); - if (this) { - if (this->rb_left == &e->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } kfree(e); } + c->size_tree = RB_ROOT; } diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index f35135e28e9..9a9fb94a41c 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -128,7 +128,6 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) freed = ubifs_destroy_tnc_subtree(znode); atomic_long_sub(freed, &ubifs_clean_zn_cnt); atomic_long_sub(freed, &c->clean_zn_cnt); - ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0); total_freed += freed; znode = zprev; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 3e4aa7281e0..3904c8574ef 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -351,7 +351,7 @@ static void ubifs_evict_inode(struct inode *inode) dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(!atomic_read(&inode->i_count)); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (inode->i_nlink) goto done; @@ -873,26 +873,10 @@ static void free_orphans(struct ubifs_info *c) */ static void free_buds(struct ubifs_info *c) { - struct rb_node *this = c->buds.rb_node; - struct ubifs_bud *bud; - - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - bud = rb_entry(this, struct ubifs_bud, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &bud->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - kfree(bud); - } - } + struct ubifs_bud *bud, *n; + + rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) + kfree(bud); } /** @@ -1165,6 +1149,9 @@ static int mount_ubifs(struct ubifs_info *c) size_t sz; c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY); + /* Suppress error messages while probing if MS_SILENT is set */ + c->probing = !!(c->vfs_sb->s_flags & MS_SILENT); + err = init_constants_early(c); if (err) return err; @@ -1230,6 +1217,8 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_free; + c->probing = 0; + /* * Make sure the compressor which is set as default in the superblock * or overridden by mount options is actually compiled in. @@ -1572,7 +1561,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) - return err; + goto out; } err = check_free_space(c); @@ -1630,8 +1619,10 @@ static int ubifs_remount_rw(struct ubifs_info *c) } c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); - if (!c->write_reserve_buf) + if (!c->write_reserve_buf) { + err = -ENOMEM; goto out; + } err = ubifs_lpt_init(c, 0, 1); if (err) @@ -1841,6 +1832,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) int err; struct ubifs_info *c = sb->s_fs_info; + sync_filesystem(sb); dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); err = ubifs_parse_options(c, data, 1); @@ -2064,8 +2056,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) } sb->s_root = d_make_root(root); - if (!sb->s_root) + if (!sb->s_root) { + err = -ENOMEM; goto out_umount; + } mutex_unlock(&c->umount_mutex); return 0; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 349f31a30f4..8a40cf9c02d 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -178,27 +178,11 @@ static int ins_clr_old_idx_znode(struct ubifs_info *c, */ void destroy_old_idx(struct ubifs_info *c) { - struct rb_node *this = c->old_idx.rb_node; - struct ubifs_old_idx *old_idx; + struct ubifs_old_idx *old_idx, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - old_idx = rb_entry(this, struct ubifs_old_idx, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &old_idx->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(old_idx, n, &c->old_idx, rb) kfree(old_idx); - } + c->old_idx = RB_ROOT; } @@ -2875,10 +2859,11 @@ void ubifs_tnc_close(struct ubifs_info *c) { tnc_destroy_cnext(c); if (c->zroot.znode) { - long n; + long n, freed; - ubifs_destroy_tnc_subtree(c->zroot.znode); n = atomic_long_read(&c->clean_zn_cnt); + freed = ubifs_destroy_tnc_subtree(c->zroot.znode); + ubifs_assert(freed == n); atomic_long_sub(n, &ubifs_clean_zn_cnt); } kfree(c->gap_lebs); diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 52a6559275c..3600994f841 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -895,9 +895,9 @@ static int write_index(struct ubifs_info *c) * the reason for the second barrier. */ clear_bit(DIRTY_ZNODE, &znode->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(COW_ZNODE, &znode->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* * We have marked the znode as clean but have not updated the diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index e8c8cfe1435..c1f71fe17cc 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -51,6 +51,15 @@ #define ubifs_warn(fmt, ...) \ pr_warn("UBIFS warning (pid %d): %s: " fmt "\n", \ current->pid, __func__, ##__VA_ARGS__) +/* + * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description + * object as an argument. + */ +#define ubifs_errc(c, fmt, ...) \ + do { \ + if (!(c)->probing) \ + ubifs_err(fmt, ##__VA_ARGS__); \ + } while (0) /* UBIFS file system VFS magic number */ #define UBIFS_SUPER_MAGIC 0x24051905 @@ -1209,6 +1218,7 @@ struct ubifs_debug_info; * @need_recovery: %1 if the file-system needs recovery * @replaying: %1 during journal replay * @mounting: %1 while mounting + * @probing: %1 while attempting to mount if MS_SILENT mount flag is set * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay @@ -1441,6 +1451,7 @@ struct ubifs_info { unsigned int replaying:1; unsigned int mounting:1; unsigned int remounting_rw:1; + unsigned int probing:1; struct list_head replay_list; struct list_head replay_buds; unsigned long long cs_sqnum; diff --git a/fs/udf/file.c b/fs/udf/file.c index c02a27a19c6..d80738fdf42 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -119,8 +119,8 @@ static int udf_adinicb_write_end(struct file *file, } static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { /* Fallback to buffered I/O. */ return 0; @@ -134,8 +134,7 @@ const struct address_space_operations udf_adinicb_aops = { .direct_IO = udf_adinicb_direct_IO, }; -static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t ppos) +static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t retval; struct file *file = iocb->ki_filp; @@ -144,18 +143,20 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, size_t count = iocb->ki_nbytes; struct udf_inode_info *iinfo = UDF_I(inode); + mutex_lock(&inode->i_mutex); down_write(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { if (file->f_flags & O_APPEND) pos = inode->i_size; else - pos = ppos; + pos = iocb->ki_pos; if (inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + pos + count)) { err = udf_expand_file_adinicb(inode); if (err) { + mutex_unlock(&inode->i_mutex); udf_debug("udf_expand_adinicb: err=%d\n", err); return err; } @@ -169,9 +170,17 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } else up_write(&iinfo->i_data_sem); - retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); - if (retval > 0) + retval = __generic_file_write_iter(iocb, from); + mutex_unlock(&inode->i_mutex); + + if (retval > 0) { + ssize_t err; + mark_inode_dirty(inode); + err = generic_write_sync(file, iocb->ki_pos - retval, retval); + if (err < 0) + retval = err; + } return retval; } @@ -242,13 +251,13 @@ static int udf_release_file(struct inode *inode, struct file *filp) } const struct file_operations udf_file_operations = { - .read = do_sync_read, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .read_iter = generic_file_read_iter, .unlocked_ioctl = udf_ioctl, .open = generic_file_open, .mmap = generic_file_mmap, - .write = do_sync_write, - .aio_write = udf_file_aio_write, + .write = new_sync_write, + .write_iter = udf_file_write_iter, .release = udf_release_file, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 062b7925bca..236cd48184c 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -146,8 +146,8 @@ void udf_evict_inode(struct inode *inode) want_delete = 1; udf_setsize(inode, 0); udf_update_inode(inode, IS_SYNC(inode)); - } else - truncate_inode_pages(&inode->i_data, 0); + } + truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && @@ -217,18 +217,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, } static ssize_t udf_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - udf_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, udf_get_block); if (unlikely(ret < 0 && (rw & WRITE))) - udf_write_failed(mapping, offset + iov_length(iov, nr_segs)); + udf_write_failed(mapping, offset + count); return ret; } @@ -265,6 +265,7 @@ int udf_expand_file_adinicb(struct inode *inode) .nr_to_write = 1, }; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); if (!iinfo->i_lenAlloc) { if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 5f6fc17d6bc..9737cba1357 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1010,6 +1010,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, else udf_truncate_tail_extent(inode); mark_inode_dirty(inode); + up_write(&iinfo->i_data_sem); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) @@ -1023,7 +1024,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) mark_inode_dirty(dir); - up_write(&iinfo->i_data_sem); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); diff --git a/fs/udf/super.c b/fs/udf/super.c index 91219385691..3286db047a4 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -76,6 +76,9 @@ #define UDF_DEFAULT_BLOCKSIZE 2048 +#define VSD_FIRST_SECTOR_OFFSET 32768 +#define VSD_MAX_SECTOR_OFFSET 0x800000 + enum { UDF_MAX_LINKS = 0xffff }; /* These are the "meat" - everything else is stuffing */ @@ -172,7 +175,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { udf_inode_cachep = kmem_cache_create("udf_inode_cache", sizeof(struct udf_inode_info), @@ -502,6 +505,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt, while ((p = strsep(&options, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; int token; + unsigned n; if (!*p) continue; @@ -513,7 +517,10 @@ static int udf_parse_options(char *options, struct udf_options *uopt, case Opt_bs: if (match_int(&args[0], &option)) return 0; - uopt->blocksize = option; + n = option; + if (n != 512 && n != 1024 && n != 2048 && n != 4096) + return 0; + uopt->blocksize = n; uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET); break; case Opt_unhide: @@ -643,6 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) int error = 0; struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); + sync_filesystem(sb); if (lvidiu) { int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) @@ -685,7 +693,7 @@ out_unlock: static loff_t udf_check_vsd(struct super_block *sb) { struct volStructDesc *vsd = NULL; - loff_t sector = 32768; + loff_t sector = VSD_FIRST_SECTOR_OFFSET; int sectorsize; struct buffer_head *bh = NULL; int nsr02 = 0; @@ -703,8 +711,18 @@ static loff_t udf_check_vsd(struct super_block *sb) udf_debug("Starting at sector %u (%ld byte sectors)\n", (unsigned int)(sector >> sb->s_blocksize_bits), sb->s_blocksize); - /* Process the sequence (if applicable) */ - for (; !nsr02 && !nsr03; sector += sectorsize) { + /* Process the sequence (if applicable). The hard limit on the sector + * offset is arbitrary, hopefully large enough so that all valid UDF + * filesystems will be recognised. There is no mention of an upper + * bound to the size of the volume recognition area in the standard. + * The limit will prevent the code to read all the sectors of a + * specially crafted image (like a bluray disc full of CD001 sectors), + * potentially causing minutes or even hours of uninterruptible I/O + * activity. This actually happened with uninitialised SSD partitions + * (all 0xFF) before the check for the limit and all valid IDs were + * added */ + for (; !nsr02 && !nsr03 && sector < VSD_MAX_SECTOR_OFFSET; + sector += sectorsize) { /* Read a block */ bh = udf_tread(sb, sector >> sb->s_blocksize_bits); if (!bh) @@ -714,10 +732,7 @@ static loff_t udf_check_vsd(struct super_block *sb) vsd = (struct volStructDesc *)(bh->b_data + (sector & (sb->s_blocksize - 1))); - if (vsd->stdIdent[0] == 0) { - brelse(bh); - break; - } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, + if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, VSD_STD_ID_LEN)) { switch (vsd->structType) { case 0: @@ -753,6 +768,17 @@ static loff_t udf_check_vsd(struct super_block *sb) else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR03, VSD_STD_ID_LEN)) nsr03 = sector; + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BOOT2, + VSD_STD_ID_LEN)) + ; /* nothing */ + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CDW02, + VSD_STD_ID_LEN)) + ; /* nothing */ + else { + /* invalid id : end of volume recognition area */ + brelse(bh); + break; + } brelse(bh); } @@ -760,7 +786,8 @@ static loff_t udf_check_vsd(struct super_block *sb) return nsr03; else if (nsr02) return nsr02; - else if (sector - (sbi->s_session << sb->s_blocksize_bits) == 32768) + else if (!bh && sector - (sbi->s_session << sb->s_blocksize_bits) == + VSD_FIRST_SECTOR_OFFSET) return -1; else return 0; @@ -1270,6 +1297,9 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) * PHYSICAL partitions are already set up */ type1_idx = i; +#ifdef UDFFS_DEBUG + map = NULL; /* supress 'maybe used uninitialized' warning */ +#endif for (i = 0; i < sbi->s_partitions; i++) { map = &sbi->s_partmaps[i]; @@ -1891,7 +1921,9 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, return 0; } if (nsr_off == -1) - udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n"); + udf_debug("Failed to read sector at offset %d. " + "Assuming open disc. Skipping validity " + "check\n", VSD_FIRST_SECTOR_OFFSET); if (!sbi->s_last_block) sbi->s_last_block = udf_get_last_block(sb); } else { diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index a7ea492ae66..7bc20809c99 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -24,7 +24,7 @@ #define INVBLOCK ((u64)-1L) -static u64 ufs_add_fragments(struct inode *, u64, unsigned, unsigned, int *); +static u64 ufs_add_fragments(struct inode *, u64, unsigned, unsigned); static u64 ufs_alloc_fragments(struct inode *, unsigned, u64, unsigned, int *); static u64 ufs_alloccg_block(struct inode *, struct ufs_cg_private_info *, u64, int *); static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *, u64, unsigned); @@ -38,7 +38,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, bit, end_bit, bbase, blkmap, i; @@ -46,7 +45,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); UFSD("ENTER, fragment %llu, count %u\n", (unsigned long long)fragment, count); @@ -54,7 +52,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) if (ufs_fragnum(fragment) + count > uspi->s_fpg) ufs_error (sb, "ufs_free_fragments", "internal error"); - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); cgno = ufs_dtog(uspi, fragment); bit = ufs_dtogd(uspi, fragment); @@ -118,12 +116,12 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT\n"); return; failed: - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT (FAILED)\n"); return; } @@ -135,7 +133,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned overflow, cgno, bit, end_bit, i; @@ -143,7 +140,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); UFSD("ENTER, fragment %llu, count %u\n", (unsigned long long)fragment, count); @@ -155,7 +151,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) goto failed; } - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); do_more: overflow = 0; @@ -215,12 +211,12 @@ do_more: } ufs_mark_sb_dirty(sb); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT\n"); return; failed_unlock: - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); failed: UFSD("EXIT (FAILED)\n"); return; @@ -361,7 +357,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, usb1 = ubh_get_usb_first(uspi); *err = -ENOSPC; - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); tmp = ufs_data_ptr_to_cpu(sb, p); if (count + ufs_fragnum(fragment) > uspi->s_fpb) { @@ -382,19 +378,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, "fragment %llu, tmp %llu\n", (unsigned long long)fragment, (unsigned long long)tmp); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return INVBLOCK; } if (fragment < UFS_I(inode)->i_lastfrag) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return 0; } } else { if (tmp) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return 0; } } @@ -403,7 +399,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, * There is not enough space for user on the device */ if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT (FAILED)\n"); return 0; } @@ -428,7 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); } - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -436,14 +432,14 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, /* * resize block */ - result = ufs_add_fragments (inode, tmp, oldcount, newcount, err); + result = ufs_add_fragments(inode, tmp, oldcount, newcount); if (result) { *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -481,7 +477,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); @@ -489,17 +485,16 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return result; } - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT (FAILED)\n"); return 0; } static u64 ufs_add_fragments(struct inode *inode, u64 fragment, - unsigned oldcount, unsigned newcount, int *err) + unsigned oldcount, unsigned newcount) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; @@ -509,7 +504,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first (uspi); count = newcount - oldcount; cgno = ufs_dtog(uspi, fragment); @@ -577,7 +571,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, allocsize; @@ -588,7 +581,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); oldcg = cgno; /* @@ -690,7 +682,6 @@ static u64 ufs_alloccg_block(struct inode *inode, { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; u64 result, blkno; @@ -698,7 +689,6 @@ static u64 ufs_alloccg_block(struct inode *inode, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (goal == 0) { @@ -794,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb, 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe }; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_super_block_first *usb1; struct ufs_cylinder_group *ucg; unsigned start, length, loc; unsigned pos, want, blockmap, mask, end; @@ -803,7 +792,6 @@ static u64 ufs_bitmap_search(struct super_block *sb, UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx, (unsigned long long)goal, count); - usb1 = ubh_get_usb_first (uspi); ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (goal) diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 33afa20d450..c84ec010a67 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -35,10 +35,10 @@ const struct file_operations ufs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = generic_file_open, .fsync = generic_file_fsync, diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index d0426d74817..a9cc75ffa92 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -57,7 +57,6 @@ void ufs_free_inode (struct inode * inode) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; int is_directory; @@ -67,15 +66,14 @@ void ufs_free_inode (struct inode * inode) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); ino = inode->i_ino; - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return; } @@ -83,7 +81,7 @@ void ufs_free_inode (struct inode * inode) bit = ufs_inotocgoff (ino); ucpi = ufs_load_cylinder (sb, cg); if (!ucpi) { - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return; } ucg = ubh_get_ucg(UCPI_UBH(ucpi)); @@ -117,7 +115,7 @@ void ufs_free_inode (struct inode * inode) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT\n"); } @@ -175,7 +173,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) struct super_block * sb; struct ufs_sb_info * sbi; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; struct inode * inode; @@ -195,9 +192,8 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) ufsi = UFS_I(inode); sbi = UFS_SB(sb); uspi = sbi->s_uspi; - usb1 = ubh_get_usb_first(uspi); - mutex_lock(&sbi->s_lock); + lock_ufs(sb); /* * Try to place the inode in its parent directory @@ -332,21 +328,20 @@ cg_found: sync_dirty_buffer(bh); brelse(bh); } - - mutex_unlock(&sbi->s_lock); + unlock_ufs(sb); UFSD("allocating inode %lu\n", inode->i_ino); UFSD("EXIT\n"); return inode; fail_remove_inode: - mutex_unlock(&sbi->s_lock); + unlock_ufs(sb); clear_nlink(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); failed: - mutex_unlock(&sbi->s_lock); + unlock_ufs(sb); make_bad_inode(inode); iput (inode); UFSD("EXIT (FAILED): err %d\n", err); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index c8ca9608678..61e8a9b021d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -885,7 +885,7 @@ void ufs_evict_inode(struct inode * inode) if (!inode->i_nlink && !is_bad_inode(inode)) want_delete = 1; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (want_delete) { loff_t old_i_size; /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 329f2f53b7e..b879f1ba343 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -524,11 +524,9 @@ static int ufs_read_cylinder_structures(struct super_block *sb) struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned size, blks, i; - struct ufs_super_block_third *usb3; UFSD("ENTER\n"); - usb3 = ubh_get_usb_third(uspi); /* * Read cs structures from (usually) first data block * on the device. @@ -699,7 +697,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) unsigned flags; lock_ufs(sb); - mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); @@ -717,7 +714,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) ufs_put_cstotal(sb); UFSD("EXIT\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; @@ -762,6 +758,7 @@ static void ufs_put_super(struct super_block *sb) ubh_brelse_uspi (sbi->s_uspi); kfree (sbi->s_uspi); + mutex_destroy(&sbi->mutex); kfree (sbi); sb->s_fs_info = NULL; UFSD("EXIT\n"); @@ -788,6 +785,14 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags = 0; UFSD("ENTER\n"); + +#ifndef CONFIG_UFS_FS_WRITE + if (!(sb->s_flags & MS_RDONLY)) { + printk("ufs was compiled with read-only support, " + "can't be mounted as read-write\n"); + return -EROFS; + } +#endif sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); if (!sbi) @@ -797,15 +802,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); -#ifndef CONFIG_UFS_FS_WRITE - if (!(sb->s_flags & MS_RDONLY)) { - printk("ufs was compiled with read-only support, " - "can't be mounted as read-write\n"); - goto failed; - } -#endif mutex_init(&sbi->mutex); - mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); /* @@ -1259,6 +1256,7 @@ magic_found: return 0; failed: + mutex_destroy(&sbi->mutex); if (ubh) ubh_brelse_uspi (uspi); kfree (uspi); @@ -1280,8 +1278,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) unsigned new_mount_opt, ufstype; unsigned flags; + sync_filesystem(sb); lock_ufs(sb); - mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); @@ -1295,7 +1293,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt = 0; ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } @@ -1303,14 +1300,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { printk("ufstype can't be changed during remount\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } @@ -1335,7 +1330,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #ifndef CONFIG_UFS_FS_WRITE printk("ufs was compiled with read-only support, " "can't be mounted as read-write\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; #else @@ -1345,13 +1339,11 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { printk("this ufstype is read-only supported\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { printk("failed during remounting\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EPERM; } @@ -1359,7 +1351,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } @@ -1389,15 +1380,11 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; unsigned flags = UFS_SB(sb)->s_flags; - struct ufs_super_block_first *usb1; - struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); lock_ufs(sb); - usb1 = ubh_get_usb_first(uspi); - usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { @@ -1453,7 +1440,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index ff2c15ab81a..343e6fc571e 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -24,7 +24,6 @@ struct ufs_sb_info { int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ - struct mutex s_lock; }; struct ufs_inode_info { diff --git a/fs/utimes.c b/fs/utimes.c index f4fb7eca10e..aa138d64560 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -53,6 +53,7 @@ static int utimes_common(struct path *path, struct timespec *times) int error; struct iattr newattrs; struct inode *inode = path->dentry->d_inode; + struct inode *delegated_inode = NULL; error = mnt_want_write(path->mnt); if (error) @@ -101,9 +102,15 @@ static int utimes_common(struct path *path, struct timespec *times) goto mnt_drop_write_and_out; } } +retry_deleg: mutex_lock(&inode->i_mutex); - error = notify_change(path->dentry, &newattrs); + error = notify_change(path->dentry, &newattrs, &delegated_inode); mutex_unlock(&inode->i_mutex); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } mnt_drop_write_and_out: mnt_drop_write(path->mnt); diff --git a/fs/xattr.c b/fs/xattr.c index 3377dff1840..c69e6d43a0d 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -843,7 +843,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) /* wrap around? */ len = sizeof(*new_xattr) + size; - if (len <= sizeof(*new_xattr)) + if (len < sizeof(*new_xattr)) return NULL; new_xattr = kmalloc(len, GFP_KERNEL); diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c deleted file mode 100644 index 9fbea87fdb6..00000000000 --- a/fs/xattr_acl.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * linux/fs/xattr_acl.c - * - * Almost all from linux/fs/ext2/acl.c: - * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> - */ - -#include <linux/export.h> -#include <linux/fs.h> -#include <linux/posix_acl_xattr.h> -#include <linux/gfp.h> -#include <linux/user_namespace.h> - -/* - * Fix up the uids and gids in posix acl extended attributes in place. - */ -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - void *value, size_t size) -{ - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; - int count; - kuid_t uid; - kgid_t gid; - - if (!value) - return; - if (size < sizeof(posix_acl_xattr_header)) - return; - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return; - - count = posix_acl_xattr_count(size); - if (count < 0) - return; - if (count == 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch(le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kuid(to, uid)); - break; - case ACL_GROUP: - gid = make_kgid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kgid(to, gid)); - break; - default: - break; - } - } -} - -void posix_acl_fix_xattr_from_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); -} - -void posix_acl_fix_xattr_to_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); -} - -/* - * Convert from extended attribute to in-memory representation. - */ -struct posix_acl * -posix_acl_from_xattr(struct user_namespace *user_ns, - const void *value, size_t size) -{ - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; - int count; - struct posix_acl *acl; - struct posix_acl_entry *acl_e; - - if (!value) - return NULL; - if (size < sizeof(posix_acl_xattr_header)) - return ERR_PTR(-EINVAL); - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return ERR_PTR(-EOPNOTSUPP); - - count = posix_acl_xattr_count(size); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; - - acl = posix_acl_alloc(count, GFP_NOFS); - if (!acl) - return ERR_PTR(-ENOMEM); - acl_e = acl->a_entries; - - for (end = entry + count; entry != end; acl_e++, entry++) { - acl_e->e_tag = le16_to_cpu(entry->e_tag); - acl_e->e_perm = le16_to_cpu(entry->e_perm); - - switch(acl_e->e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - break; - - case ACL_USER: - acl_e->e_uid = - make_kuid(user_ns, - le32_to_cpu(entry->e_id)); - if (!uid_valid(acl_e->e_uid)) - goto fail; - break; - case ACL_GROUP: - acl_e->e_gid = - make_kgid(user_ns, - le32_to_cpu(entry->e_id)); - if (!gid_valid(acl_e->e_gid)) - goto fail; - break; - - default: - goto fail; - } - } - return acl; - -fail: - posix_acl_release(acl); - return ERR_PTR(-EINVAL); -} -EXPORT_SYMBOL (posix_acl_from_xattr); - -/* - * Convert from in-memory to extended attribute representation. - */ -int -posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, - void *buffer, size_t size) -{ - posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; - posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; - int real_size, n; - - real_size = posix_acl_xattr_size(acl->a_count); - if (!buffer) - return real_size; - if (real_size > size) - return -ERANGE; - - ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); - - for (n=0; n < acl->a_count; n++, ext_entry++) { - const struct posix_acl_entry *acl_e = &acl->a_entries[n]; - ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); - ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); - switch(acl_e->e_tag) { - case ACL_USER: - ext_entry->e_id = - cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); - break; - case ACL_GROUP: - ext_entry->e_id = - cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); - break; - default: - ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); - break; - } - } - return real_size; -} -EXPORT_SYMBOL (posix_acl_to_xattr); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 0719e4db93f..c21f4350666 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -66,12 +66,14 @@ xfs-y += xfs_alloc.o \ xfs_bmap_btree.o \ xfs_btree.o \ xfs_da_btree.o \ + xfs_da_format.o \ xfs_dir2.o \ xfs_dir2_block.o \ xfs_dir2_data.o \ xfs_dir2_leaf.o \ xfs_dir2_node.o \ xfs_dir2_sf.o \ + xfs_dquot_buf.o \ xfs_ialloc.o \ xfs_ialloc_btree.o \ xfs_icreate_item.o \ @@ -103,7 +105,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs_qm_bhv.o \ xfs_qm.o \ xfs_quotaops.o -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o + +# xfs_rtbitmap is shared with libxfs +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ + xfs_rtbitmap.o + xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_PROC_FS) += xfs_stats.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index a02cfb9e3bc..844e288b957 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -63,25 +63,33 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) } void * -kmem_zalloc(size_t size, xfs_km_flags_t flags) -{ - void *ptr; - - ptr = kmem_alloc(size, flags); - if (ptr) - memset((char *)ptr, 0, (int)size); - return ptr; -} - -void * kmem_zalloc_large(size_t size, xfs_km_flags_t flags) { + unsigned noio_flag = 0; void *ptr; + gfp_t lflags; ptr = kmem_zalloc(size, flags | KM_MAYFAIL); if (ptr) return ptr; - return vzalloc(size); + + /* + * __vmalloc() will allocate data pages and auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context + * here. Hence we need to tell memory reclaim that we are in such a + * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering + * the filesystem here and potentially deadlocking. + */ + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + noio_flag = memalloc_noio_save(); + + lflags = kmem_flags_convert(flags); + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + memalloc_noio_restore(noio_flag); + + return ptr; } void @@ -128,14 +136,3 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } - -void * -kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) -{ - void *ptr; - - ptr = kmem_zone_alloc(zone, flags); - if (ptr) - memset((char *)ptr, 0, kmem_cache_size(zone)); - return ptr; -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 3a7371cab50..64db0e53ede 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -32,6 +32,7 @@ typedef unsigned __bitwise xfs_km_flags_t; #define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) #define KM_NOFS ((__force xfs_km_flags_t)0x0004u) #define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) +#define KM_ZERO ((__force xfs_km_flags_t)0x0010u) /* * We use a special process flag to avoid recursive callbacks into @@ -43,7 +44,7 @@ kmem_flags_convert(xfs_km_flags_t flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL)); + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); if (flags & KM_NOSLEEP) { lflags = GFP_ATOMIC | __GFP_NOWARN; @@ -52,11 +53,14 @@ kmem_flags_convert(xfs_km_flags_t flags) if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) lflags &= ~__GFP_FS; } + + if (flags & KM_ZERO) + lflags |= __GFP_ZERO; + return lflags; } extern void *kmem_alloc(size_t, xfs_km_flags_t); -extern void *kmem_zalloc(size_t, xfs_km_flags_t); extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); extern void kmem_free(const void *); @@ -64,6 +68,12 @@ extern void kmem_free(const void *); extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); +static inline void * +kmem_zalloc(size_t size, xfs_km_flags_t flags) +{ + return kmem_alloc(size, flags | KM_ZERO); +} + /* * Zone interfaces */ @@ -102,6 +112,11 @@ kmem_zone_destroy(kmem_zone_t *zone) } extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); -extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t); + +static inline void * +kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) +{ + return kmem_zone_alloc(zone, flags | KM_ZERO); +} #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 0e2f37efedd..6888ad886ff 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -16,15 +16,15 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_acl.h" -#include "xfs_attr.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" #include "xfs_ag.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_acl.h" +#include "xfs_attr.h" #include "xfs_trace.h" #include <linux/slab.h> #include <linux/xattr.h> @@ -124,16 +124,12 @@ struct posix_acl * xfs_get_acl(struct inode *inode, int type) { struct xfs_inode *ip = XFS_I(inode); - struct posix_acl *acl; + struct posix_acl *acl = NULL; struct xfs_acl *xfs_acl; unsigned char *ea_name; int error; int len; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - trace_xfs_get_acl(ip); switch (type) { @@ -164,10 +160,8 @@ xfs_get_acl(struct inode *inode, int type) * cache entry, for any other error assume it is transient and * leave the cache entry as ACL_NOT_CACHED. */ - if (error == -ENOATTR) { - acl = NULL; + if (error == -ENOATTR) goto out_update_cache; - } goto out; } @@ -183,15 +177,12 @@ out: } STATIC int -xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) { struct xfs_inode *ip = XFS_I(inode); unsigned char *ea_name; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: ea_name = SGI_ACL_FILE; @@ -282,131 +273,23 @@ posix_acl_default_exists(struct inode *inode) return xfs_acl_exists(inode, SGI_ACL_DEFAULT); } -/* - * No need for i_mutex because the inode is not yet exposed to the VFS. - */ -int -xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) -{ - umode_t mode = inode->i_mode; - int error = 0, inherit = 0; - - if (S_ISDIR(inode->i_mode)) { - error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto out; - } - - error = posix_acl_create(&acl, GFP_KERNEL, &mode); - if (error < 0) - return error; - - /* - * If posix_acl_create returns a positive value we need to - * inherit a permission that can't be represented using the Unix - * mode bits and we actually need to set an ACL. - */ - if (error > 0) - inherit = 1; - - error = xfs_set_mode(inode, mode); - if (error) - goto out; - - if (inherit) - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - -out: - posix_acl_release(acl); - return error; -} - int -xfs_acl_chmod(struct inode *inode) +xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl; - int error; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; - - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; -} - -static int -xfs_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int error; - - acl = xfs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return error; -} - -static int -xfs_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; int error = 0; - if (flags & XATTR_CREATE) - return -EINVAL; - if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (!value) + if (!acl) goto set_acl; - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (!acl) { - /* - * acl_set_file(3) may request that we set default ACLs with - * zero length -- defend (gracefully) against that here. - */ - goto out; - } - if (IS_ERR(acl)) { - error = PTR_ERR(acl); - goto out; - } - - error = posix_acl_valid(acl); - if (error) - goto out_release; - - error = -EINVAL; + error = -E2BIG; if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) - goto out_release; + return error; if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error <= 0) { - posix_acl_release(acl); acl = NULL; if (error < 0) @@ -415,27 +298,9 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, error = xfs_set_mode(inode, mode); if (error) - goto out_release; + return error; } set_acl: - error = xfs_set_acl(inode, type, acl); - out_release: - posix_acl_release(acl); - out: - return error; + return __xfs_set_acl(inode, type, acl); } - -const struct xattr_handler xfs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = xfs_xattr_acl_get, - .set = xfs_xattr_acl_set, -}; - -const struct xattr_handler xfs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = xfs_xattr_acl_get, - .set = xfs_xattr_acl_set, -}; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 4016a567b83..5dc16374451 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -60,20 +60,15 @@ struct xfs_acl { #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); -extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); -extern int xfs_acl_chmod(struct inode *inode); +extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); - -extern const struct xattr_handler xfs_xattr_acl_access_handler; -extern const struct xattr_handler xfs_xattr_acl_default_handler; #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } -# define xfs_inherit_acl(inode, default_acl) 0 -# define xfs_acl_chmod(inode) 0 +# define xfs_set_acl NULL # define posix_acl_access_exists(inode) 0 # define posix_acl_default_exists(inode) 0 #endif /* CONFIG_XFS_POSIX_ACL */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 1cb740afd67..6e247a99f5d 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -89,6 +89,8 @@ typedef struct xfs_agf { /* structure must be padded to 64 bit alignment */ } xfs_agf_t; +#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) + #define XFS_AGF_MAGICNUM 0x00000001 #define XFS_AGF_VERSIONNUM 0x00000002 #define XFS_AGF_SEQNO 0x00000004 @@ -128,8 +130,6 @@ typedef struct xfs_agf { extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); -extern const struct xfs_buf_ops xfs_agf_buf_ops; - /* * Size of the unlinked inode hash table in the agi. */ @@ -160,28 +160,38 @@ typedef struct xfs_agi { * still being referenced. */ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; - + /* + * This marks the end of logging region 1 and start of logging region 2. + */ uuid_t agi_uuid; /* uuid of filesystem */ __be32 agi_crc; /* crc of agi sector */ __be32 agi_pad32; __be64 agi_lsn; /* last write sequence */ + __be32 agi_free_root; /* root of the free inode btree */ + __be32 agi_free_level;/* levels in free inode btree */ + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; -#define XFS_AGI_MAGICNUM 0x00000001 -#define XFS_AGI_VERSIONNUM 0x00000002 -#define XFS_AGI_SEQNO 0x00000004 -#define XFS_AGI_LENGTH 0x00000008 -#define XFS_AGI_COUNT 0x00000010 -#define XFS_AGI_ROOT 0x00000020 -#define XFS_AGI_LEVEL 0x00000040 -#define XFS_AGI_FREECOUNT 0x00000080 -#define XFS_AGI_NEWINO 0x00000100 -#define XFS_AGI_DIRINO 0x00000200 -#define XFS_AGI_UNLINKED 0x00000400 -#define XFS_AGI_NUM_BITS 11 -#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1) +#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) + +#define XFS_AGI_MAGICNUM (1 << 0) +#define XFS_AGI_VERSIONNUM (1 << 1) +#define XFS_AGI_SEQNO (1 << 2) +#define XFS_AGI_LENGTH (1 << 3) +#define XFS_AGI_COUNT (1 << 4) +#define XFS_AGI_ROOT (1 << 5) +#define XFS_AGI_LEVEL (1 << 6) +#define XFS_AGI_FREECOUNT (1 << 7) +#define XFS_AGI_NEWINO (1 << 8) +#define XFS_AGI_DIRINO (1 << 9) +#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ +#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1 << 11) +#define XFS_AGI_FREE_LEVEL (1 << 12) +#define XFS_AGI_NUM_BITS_R2 13 /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) @@ -191,8 +201,6 @@ typedef struct xfs_agi { extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); -extern const struct xfs_buf_ops xfs_agi_buf_ops; - /* * The third a.g. block contains the a.g. freelist, an array * of block pointers to blocks owned by the allocation btree code. @@ -226,6 +234,8 @@ typedef struct xfs_agfl { __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ } xfs_agfl_t; +#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) + /* * tags for inode radix tree */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 5a1393f5e02..d43813267a8 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -17,25 +17,25 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" +#include "xfs_log.h" struct workqueue_struct *xfs_alloc_wq; @@ -257,16 +257,14 @@ xfs_alloc_fix_len( k = rlen % args->prod; if (k == args->mod) return; - if (k > args->mod) { - if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen) - return; - } else { - if ((int)(rlen = rlen - args->prod - (args->mod - k)) < - (int)args->minlen) - return; - } - ASSERT(rlen >= args->minlen); - ASSERT(rlen <= args->maxlen); + if (k > args->mod) + rlen = rlen - (k - args->mod); + else + rlen = rlen - args->prod + (args->mod - k); + if ((int)rlen < (int)args->minlen) + return; + ASSERT(rlen >= args->minlen && rlen <= args->maxlen); + ASSERT(rlen % args->prod == args->mod); args->len = rlen; } @@ -474,7 +472,6 @@ xfs_agfl_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - int agfl_ok = 1; /* * There is no verification of non-crc AGFLs because mkfs does not @@ -485,15 +482,13 @@ xfs_agfl_read_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agfl, agfl_crc)); - - agfl_ok = agfl_ok && xfs_agfl_verify(bp); - - if (!agfl_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_agfl_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -508,16 +503,15 @@ xfs_agfl_write_verify( return; if (!xfs_agfl_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } if (bip) XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agfl, agfl_crc)); + xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF); } const struct xfs_buf_ops xfs_agfl_buf_ops = { @@ -545,7 +539,6 @@ xfs_alloc_read_agfl( XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; - ASSERT(!xfs_buf_geterror(bp)); xfs_buf_set_ref(bp, XFS_AGFL_REF); *bpp = bp; return 0; @@ -2238,19 +2231,17 @@ xfs_agf_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - int agf_ok = 1; - - if (xfs_sb_version_hascrc(&mp->m_sb)) - agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agf, agf_crc)); - - agf_ok = agf_ok && xfs_agf_verify(mp, bp); - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, + XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -2261,8 +2252,8 @@ xfs_agf_write_verify( struct xfs_buf_log_item *bip = bp->b_fspriv; if (!xfs_agf_verify(mp, bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -2272,8 +2263,7 @@ xfs_agf_write_verify( if (bip) XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agf, agf_crc)); + xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); } const struct xfs_buf_ops xfs_agf_buf_ops = { @@ -2294,6 +2284,8 @@ xfs_read_agf( { int error; + trace_xfs_read_agf(mp, agno); + ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, @@ -2324,8 +2316,9 @@ xfs_alloc_read_agf( struct xfs_perag *pag; /* per allocation group data */ int error; - ASSERT(agno != NULLAGNUMBER); + trace_xfs_alloc_read_agf(mp, agno); + ASSERT(agno != NULLAGNUMBER); error = xfs_read_agf(mp, tp, agno, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, bpp); diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 99d0a610155..feacb061bab 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -231,7 +231,4 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ -extern const struct xfs_buf_ops xfs_agf_buf_ops; -extern const struct xfs_buf_ops xfs_agfl_buf_ops; - #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index cafc90251d1..8358f1ded94 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -17,23 +17,21 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_trans.h" STATIC struct xfs_btree_cur * @@ -72,7 +70,6 @@ xfs_allocbt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int length, int *stat) { int error; @@ -357,12 +354,14 @@ static void xfs_allocbt_read_verify( struct xfs_buf *bp) { - if (!(xfs_btree_sblock_verify_crc(bp) && - xfs_allocbt_verify(bp))) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_allocbt_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); } } @@ -372,9 +371,9 @@ xfs_allocbt_write_verify( { if (!xfs_allocbt_verify(bp)) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; } xfs_btree_sblock_calc_crc(bp); diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index e3a3f742419..45e189e7e81 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -27,39 +27,6 @@ struct xfs_btree_cur; struct xfs_mount; /* - * There are two on-disk btrees, one sorted by blockno and one sorted - * by blockcount and blockno. All blocks look the same to make the code - * simpler; if we have time later, we'll make the optimizations. - */ -#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ -#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */ -#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ -#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */ - -/* - * Data record/key structure - */ -typedef struct xfs_alloc_rec { - __be32 ar_startblock; /* starting block number */ - __be32 ar_blockcount; /* count of free blocks */ -} xfs_alloc_rec_t, xfs_alloc_key_t; - -typedef struct xfs_alloc_rec_incore { - xfs_agblock_t ar_startblock; /* starting block number */ - xfs_extlen_t ar_blockcount; /* count of free blocks */ -} xfs_alloc_rec_incore_t; - -/* btree pointer type */ -typedef __be32 xfs_alloc_ptr_t; - -/* - * Block numbers in the AG: - * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. - */ -#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) -#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) - -/* * Btree block header size depends on a superblock flag. */ #define XFS_ALLOC_BLOCK_LEN(mp) \ @@ -95,6 +62,4 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, xfs_agnumber_t, xfs_btnum_t); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); -extern const struct xfs_buf_ops xfs_allocbt_buf_ops; - #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e51e581454e..faaf716e208 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -16,14 +16,15 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_log.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_trans.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_error.h" @@ -31,6 +32,8 @@ #include "xfs_trace.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" #include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> @@ -333,7 +336,7 @@ xfs_map_blocks( if (type == XFS_IO_DELALLOC && (!nimaps || isnullstartblock(imap->br_startblock))) { - error = xfs_iomap_write_allocate(ip, offset, count, imap); + error = xfs_iomap_write_allocate(ip, offset, imap); if (!error) trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); return -XFS_ERROR(error); @@ -404,7 +407,7 @@ xfs_alloc_ioend_bio( struct bio *bio = bio_alloc(GFP_NOIO, nvecs); ASSERT(bio->bi_private == NULL); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; return bio; } @@ -629,38 +632,46 @@ xfs_map_at_offset( } /* - * Test if a given page is suitable for writing as part of an unwritten - * or delayed allocate extent. + * Test if a given page contains at least one buffer of a given @type. + * If @check_all_buffers is true, then we walk all the buffers in the page to + * try to find one of the type passed in. If it is not set, then the caller only + * needs to check the first buffer on the page for a match. */ -STATIC int +STATIC bool xfs_check_page_type( struct page *page, - unsigned int type) + unsigned int type, + bool check_all_buffers) { - if (PageWriteback(page)) - return 0; + struct buffer_head *bh; + struct buffer_head *head; - if (page->mapping && page_has_buffers(page)) { - struct buffer_head *bh, *head; - int acceptable = 0; + if (PageWriteback(page)) + return false; + if (!page->mapping) + return false; + if (!page_has_buffers(page)) + return false; - bh = head = page_buffers(page); - do { - if (buffer_unwritten(bh)) - acceptable += (type == XFS_IO_UNWRITTEN); - else if (buffer_delay(bh)) - acceptable += (type == XFS_IO_DELALLOC); - else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable += (type == XFS_IO_OVERWRITE); - else - break; - } while ((bh = bh->b_this_page) != head); + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) { + if (type == XFS_IO_UNWRITTEN) + return true; + } else if (buffer_delay(bh)) { + if (type == XFS_IO_DELALLOC) + return true; + } else if (buffer_dirty(bh) && buffer_mapped(bh)) { + if (type == XFS_IO_OVERWRITE) + return true; + } - if (acceptable) - return 1; - } + /* If we are only checking the first buffer, we are done now. */ + if (!check_all_buffers) + break; + } while ((bh = bh->b_this_page) != head); - return 0; + return false; } /* @@ -694,7 +705,7 @@ xfs_convert_page( goto fail_unlock_page; if (page->mapping != inode->i_mapping) goto fail_unlock_page; - if (!xfs_check_page_type(page, (*ioendp)->io_type)) + if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) goto fail_unlock_page; /* @@ -739,6 +750,15 @@ xfs_convert_page( p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; page_dirty = p_offset / len; + /* + * The moment we find a buffer that doesn't match our current type + * specification or can't be written, abort the loop and start + * writeback. As per the above xfs_imap_valid() check, only + * xfs_vm_writepage() can handle partial page writeback fully - we are + * limited here to the buffers that are contiguous with the current + * ioend, and hence a buffer we can't write breaks that contiguity and + * we have to defer the rest of the IO to xfs_vm_writepage(). + */ bh = head = page_buffers(page); do { if (offset >= end_offset) @@ -747,7 +767,7 @@ xfs_convert_page( uptodate = 0; if (!(PageUptodate(page) || buffer_uptodate(bh))) { done = 1; - continue; + break; } if (buffer_unwritten(bh) || buffer_delay(bh) || @@ -759,10 +779,11 @@ xfs_convert_page( else type = XFS_IO_OVERWRITE; - if (!xfs_imap_valid(inode, imap, offset)) { - done = 1; - continue; - } + /* + * imap should always be valid because of the above + * partial page end_offset check on the imap. + */ + ASSERT(xfs_imap_valid(inode, imap, offset)); lock_buffer(bh); if (type != XFS_IO_OVERWRITE) @@ -774,6 +795,7 @@ xfs_convert_page( count++; } else { done = 1; + break; } } while (offset += len, (bh = bh->b_this_page) != head); @@ -865,7 +887,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_check_page_type(page, XFS_IO_DELALLOC)) + if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -953,14 +975,39 @@ xfs_vm_writepage( * Given that we do not allow direct reclaim to call us, we should * never be called while in a filesystem transaction. */ - if (WARN_ON(current->flags & PF_FSTRANS)) + if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) goto redirty; /* Is this page beyond the end of the file? */ offset = i_size_read(inode); end_index = offset >> PAGE_CACHE_SHIFT; last_index = (offset - 1) >> PAGE_CACHE_SHIFT; - if (page->index >= end_index) { + + /* + * The page index is less than the end_index, adjust the end_offset + * to the highest offset that this page should represent. + * ----------------------------------------------------- + * | file mapping | <EOF> | + * ----------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | | + * ^--------------------------------^----------|-------- + * | desired writeback range | see else | + * ---------------------------------^------------------| + */ + if (page->index < end_index) + end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; + else { + /* + * Check whether the page to write out is beyond or straddles + * i_size or not. + * ------------------------------------------------------- + * | file mapping | <EOF> | + * ------------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | + * ^--------------------------------^-----------|--------- + * | | Straddles | + * ---------------------------------^-----------|--------| + */ unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); /* @@ -968,24 +1015,36 @@ xfs_vm_writepage( * truncate operation that is in progress. We must redirty the * page so that reclaim stops reclaiming it. Otherwise * xfs_vm_releasepage() is called on it and gets confused. + * + * Note that the end_index is unsigned long, it would overflow + * if the given offset is greater than 16TB on 32-bit system + * and if we do check the page is fully outside i_size or not + * via "if (page->index >= end_index + 1)" as "end_index + 1" + * will be evaluated to 0. Hence this page will be redirtied + * and be written out repeatedly which would result in an + * infinite loop, the user program that perform this operation + * will hang. Instead, we can verify this situation by checking + * if the page to write is totally beyond the i_size or if it's + * offset is just equal to the EOF. */ - if (page->index >= end_index + 1 || offset_into_page == 0) + if (page->index > end_index || + (page->index == end_index && offset_into_page == 0)) goto redirty; /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining + * that is not a multiple of the page size, the remaining * memory is zeroed when mapped, and writes to that region are * not written out to the file." */ zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); + + /* Adjust the end_offset to the end of file */ + end_offset = offset; } - end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, - offset); len = 1 << inode->i_blkbits; bh = head = page_buffers(page); @@ -1166,9 +1225,9 @@ xfs_vm_releasepage( xfs_count_page_state(page, &delalloc, &unwritten); - if (WARN_ON(delalloc)) + if (WARN_ON_ONCE(delalloc)) return 0; - if (WARN_ON(unwritten)) + if (WARN_ON_ONCE(unwritten)) return 0; return try_to_free_buffers(page); @@ -1214,7 +1273,7 @@ __xfs_get_blocks( lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); } else { - lockmode = xfs_ilock_map_shared(ip); + lockmode = xfs_ilock_data_map_shared(ip); } ASSERT(offset <= mp->m_super->s_maxbytes); @@ -1322,6 +1381,14 @@ __xfs_get_blocks( /* * If this is O_DIRECT or the mpage code calling tell them how large * the mapping is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the + * mapping for blocks beyond EOF must be marked new so that sub block + * regions can be correctly zeroed. We can't do this for mappings within + * EOF unless the mapping was just allocated or is unwritten, otherwise + * the callers would overwrite existing data with zeros. Hence we have + * to split the mapping into a range up to and including EOF, and a + * second mapping for beyond EOF. */ if (direct || size > (1 << inode->i_blkbits)) { xfs_off_t mapping_size; @@ -1332,6 +1399,12 @@ __xfs_get_blocks( ASSERT(mapping_size > 0); if (mapping_size > size) mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; @@ -1413,9 +1486,8 @@ STATIC ssize_t xfs_vm_direct_IO( int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct block_device *bdev = xfs_find_bdev_for_inode(inode); @@ -1423,7 +1495,7 @@ xfs_vm_direct_IO( ssize_t ret; if (rw & WRITE) { - size_t size = iov_length(iov, nr_segs); + size_t size = iov_iter_count(iter); /* * We cannot preallocate a size update transaction here as we @@ -1435,16 +1507,15 @@ xfs_vm_direct_IO( if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, - offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct_write, NULL, 0); + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + offset, xfs_get_blocks_direct, + xfs_end_io_direct_write, NULL, + DIO_ASYNC_EXTEND); if (ret != -EIOCBQUEUED && iocb->private) goto out_destroy_ioend; } else { - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, - offset, nr_segs, - xfs_get_blocks_direct, + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + offset, xfs_get_blocks_direct, NULL, NULL, 0); } @@ -1543,6 +1614,16 @@ xfs_vm_write_failed( xfs_vm_kill_delalloc_range(inode, block_offset, block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); } } @@ -1569,20 +1650,28 @@ xfs_vm_write_begin( ASSERT(len <= PAGE_CACHE_SIZE); - page = grab_cache_page_write_begin(mapping, index, - flags | AOP_FLAG_NOFS); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; status = __block_write_begin(page, pos, len, xfs_get_blocks); if (unlikely(status)) { struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); xfs_vm_write_failed(inode, page, pos, len); unlock_page(page); - if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, i_size_read(inode)); + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } page_cache_release(page); page = NULL; @@ -1593,9 +1682,12 @@ xfs_vm_write_begin( } /* - * On failure, we only need to kill delalloc blocks beyond EOF because they - * will never be written. For blocks within EOF, generic_write_end() zeros them - * so they are safe to leave alone and be written with all the other valid data. + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. */ STATIC int xfs_vm_write_end( @@ -1618,8 +1710,11 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, isize); + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); } } return ret; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index ddcf2267ffa..bfe36fc2cdc 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -17,23 +17,24 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" #include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_alloc.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_attr_remote.h" @@ -41,6 +42,7 @@ #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" +#include "xfs_dinode.h" /* * xfs_attr.c @@ -75,17 +77,27 @@ STATIC int xfs_attr_refillstate(xfs_da_state_t *state); STATIC int -xfs_attr_name_to_xname( - struct xfs_name *xname, - const unsigned char *aname) +xfs_attr_args_init( + struct xfs_da_args *args, + struct xfs_inode *dp, + const unsigned char *name, + int flags) { - if (!aname) + + if (!name) return EINVAL; - xname->name = aname; - xname->len = strlen((char *)aname); - if (xname->len >= MAXNAMELEN) + + memset(args, 0, sizeof(*args)); + args->geo = dp->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->dp = dp; + args->flags = flags; + args->name = name; + args->namelen = strlen((const char *)name); + if (args->namelen >= MAXNAMELEN) return EFAULT; /* match IRIX behaviour */ + args->hashval = xfs_da_hashname(args->name, args->namelen); return 0; } @@ -104,78 +116,46 @@ xfs_inode_hasattr( * Overall external interface routines. *========================================================================*/ -STATIC int -xfs_attr_get_int( +int +xfs_attr_get( struct xfs_inode *ip, - struct xfs_name *name, + const unsigned char *name, unsigned char *value, int *valuelenp, int flags) { - xfs_da_args_t args; - int error; + struct xfs_da_args args; + uint lock_mode; + int error; + + XFS_STATS_INC(xs_attr_get); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return EIO; if (!xfs_inode_hasattr(ip)) return ENOATTR; - /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name->name; - args.namelen = name->len; + error = xfs_attr_args_init(&args, ip, name, flags); + if (error) + return error; + args.value = value; args.valuelen = *valuelenp; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = ip; - args.whichfork = XFS_ATTR_FORK; - /* - * Decide on what work routines to call based on the inode size. - */ - if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + lock_mode = xfs_ilock_attr_map_shared(ip); + if (!xfs_inode_hasattr(ip)) + error = ENOATTR; + else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) error = xfs_attr_shortform_getvalue(&args); - } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { + else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) error = xfs_attr_leaf_get(&args); - } else { + else error = xfs_attr_node_get(&args); - } + xfs_iunlock(ip, lock_mode); - /* - * Return the number of bytes in the value to the caller. - */ *valuelenp = args.valuelen; - - if (error == EEXIST) - error = 0; - return(error); -} - -int -xfs_attr_get( - xfs_inode_t *ip, - const unsigned char *name, - unsigned char *value, - int *valuelenp, - int flags) -{ - int error; - struct xfs_name xname; - - XFS_STATS_INC(xs_attr_get); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return(EIO); - - error = xfs_attr_name_to_xname(&xname, name); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return(error); + return error == EEXIST ? 0 : error; } /* @@ -183,12 +163,10 @@ xfs_attr_get( */ STATIC int xfs_attr_calc_size( - struct xfs_inode *ip, - int namelen, - int valuelen, + struct xfs_da_args *args, int *local) { - struct xfs_mount *mp = ip->i_mount; + struct xfs_mount *mp = args->dp->i_mount; int size; int nblks; @@ -196,12 +174,10 @@ xfs_attr_calc_size( * Determine space new attribute will use, and if it would be * "local" or "remote" (note: local != inline). */ - size = xfs_attr_leaf_newentsize(namelen, valuelen, - mp->m_sb.sb_blocksize, local); - + size = xfs_attr_leaf_newentsize(args, local); nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); if (*local) { - if (size > (mp->m_sb.sb_blocksize >> 1)) { + if (size > (args->geo->blksize / 2)) { /* Double split possible */ nblks *= 2; } @@ -210,7 +186,7 @@ xfs_attr_calc_size( * Out of line attribute, cannot double split, but * make room for the attribute value itself. */ - uint dblocks = XFS_B_TO_FSB(mp, valuelen); + uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen); nblks += dblocks; nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); } @@ -218,26 +194,38 @@ xfs_attr_calc_size( return nblks; } -STATIC int -xfs_attr_set_int( - struct xfs_inode *dp, - struct xfs_name *name, - unsigned char *value, - int valuelen, - int flags) +int +xfs_attr_set( + struct xfs_inode *dp, + const unsigned char *name, + unsigned char *value, + int valuelen, + int flags) { - xfs_da_args_t args; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - int error, err2, committed; struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; struct xfs_trans_res tres; + xfs_fsblock_t firstblock; int rsvd = (flags & ATTR_ROOT) != 0; - int local; + int error, err2, committed, local; + + XFS_STATS_INC(xs_attr_set); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return EIO; + + error = xfs_attr_args_init(&args, dp, name, flags); + if (error) + return error; + + args.value = value; + args.valuelen = valuelen; + args.firstblock = &firstblock; + args.flist = &flist; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + args.total = xfs_attr_calc_size(&args, &local); - /* - * Attach the dquots to the inode. - */ error = xfs_qm_dqattach(dp, 0); if (error) return error; @@ -248,32 +236,14 @@ xfs_attr_set_int( */ if (XFS_IFORK_Q(dp) == 0) { int sf_size = sizeof(xfs_attr_sf_hdr_t) + - XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen); + XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen); - if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd))) - return(error); + error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + if (error) + return error; } /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name->name; - args.namelen = name->len; - args.value = value; - args.valuelen = valuelen; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = dp; - args.firstblock = &firstblock; - args.flist = &flist; - args.whichfork = XFS_ATTR_FORK; - args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - - /* Size is now blocks for attribute data */ - args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local); - - /* * Start our first transaction of the day. * * All future transactions during this code must be "chained" off @@ -300,7 +270,7 @@ xfs_attr_set_int( error = xfs_trans_reserve(args.trans, &tres, args.total, 0); if (error) { xfs_trans_cancel(args.trans, 0); - return(error); + return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -310,7 +280,7 @@ xfs_attr_set_int( if (error) { xfs_iunlock(dp, XFS_ILOCK_EXCL); xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); - return (error); + return error; } xfs_trans_ijoin(args.trans, dp, 0); @@ -319,9 +289,9 @@ xfs_attr_set_int( * If the attribute list is non-existent or a shortform list, * upgrade it to a single-leaf-block attribute list. */ - if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || - ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) && - (dp->i_d.di_anextents == 0))) { + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { /* * Build initial attribute list (if required). @@ -346,9 +316,8 @@ xfs_attr_set_int( * the transaction goes to disk before returning * to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); - } if (!error && (flags & ATTR_KERNOTIME) == 0) { xfs_trans_ichgtime(args.trans, dp, @@ -358,7 +327,7 @@ xfs_attr_set_int( XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error == 0 ? err2 : error); + return error ? error : err2; } /* @@ -396,22 +365,19 @@ xfs_attr_set_int( } - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) error = xfs_attr_leaf_addname(&args); - } else { + else error = xfs_attr_node_addname(&args); - } - if (error) { + if (error) goto out; - } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); - } if ((flags & ATTR_KERNOTIME) == 0) xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); @@ -423,65 +389,47 @@ xfs_attr_set_int( error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; out: - if (args.trans) + if (args.trans) { xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; } +/* + * Generic handler routine to remove a name from an attribute list. + * Transitions attribute list from Btree to shortform as necessary. + */ int -xfs_attr_set( - xfs_inode_t *dp, - const unsigned char *name, - unsigned char *value, - int valuelen, - int flags) +xfs_attr_remove( + struct xfs_inode *dp, + const unsigned char *name, + int flags) { - int error; - struct xfs_name xname; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; + xfs_fsblock_t firstblock; + int error; - XFS_STATS_INC(xs_attr_set); + XFS_STATS_INC(xs_attr_remove); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return (EIO); + return EIO; + + if (!xfs_inode_hasattr(dp)) + return ENOATTR; - error = xfs_attr_name_to_xname(&xname, name); + error = xfs_attr_args_init(&args, dp, name, flags); if (error) return error; - return xfs_attr_set_int(dp, &xname, value, valuelen, flags); -} - -/* - * Generic handler routine to remove a name from an attribute list. - * Transitions attribute list from Btree to shortform as necessary. - */ -STATIC int -xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) -{ - xfs_da_args_t args; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - int error; - xfs_mount_t *mp = dp->i_mount; - - /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name->name; - args.namelen = name->len; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = dp; args.firstblock = &firstblock; args.flist = &flist; - args.total = 0; - args.whichfork = XFS_ATTR_FORK; /* * we have no control over the attribute names that userspace passes us @@ -490,9 +438,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) */ args.op_flags = XFS_DA_OP_OKNOENT; - /* - * Attach the dquots to the inode. - */ error = xfs_qm_dqattach(dp, 0); if (error) return error; @@ -521,7 +466,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) XFS_ATTRRM_SPACE_RES(mp), 0); if (error) { xfs_trans_cancel(args.trans, 0); - return(error); + return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -531,35 +476,26 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) */ xfs_trans_ijoin(args.trans, dp, 0); - /* - * Decide on what work routines to call based on the inode size. - */ if (!xfs_inode_hasattr(dp)) { error = XFS_ERROR(ENOATTR); - goto out; - } - if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); error = xfs_attr_shortform_remove(&args); - if (error) { - goto out; - } } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { error = xfs_attr_leaf_removename(&args); } else { error = xfs_attr_node_removename(&args); } - if (error) { + + if (error) goto out; - } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); - } if ((flags & ATTR_KERNOTIME) == 0) xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); @@ -571,45 +507,17 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; out: - if (args.trans) + if (args.trans) { xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); -} - -int -xfs_attr_remove( - xfs_inode_t *dp, - const unsigned char *name, - int flags) -{ - int error; - struct xfs_name xname; - - XFS_STATS_INC(xs_attr_remove); - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return (EIO); - - error = xfs_attr_name_to_xname(&xname, name); - if (error) - return error; - - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!xfs_inode_hasattr(dp)) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return XFS_ERROR(ENOATTR); } - xfs_iunlock(dp, XFS_ILOCK_SHARED); - - return xfs_attr_remove_int(dp, &xname, flags); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; } - /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ @@ -695,11 +603,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) trace_xfs_attr_leaf_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* @@ -791,6 +710,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -943,7 +863,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args) } /*======================================================================== - * External routines when attribute list size > XFS_LBSIZE(mp). + * External routines when attribute list size > geo->blksize *========================================================================*/ /* @@ -976,8 +896,6 @@ restart: state = xfs_da_state_alloc(); state->args = args; state->mp = mp; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name already exists, and get back a pointer @@ -996,13 +914,22 @@ restart: trace_xfs_attr_node_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ args->rmtblkno = 0; args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } retval = xfs_attr3_leaf_add(blk->bp, state->args); @@ -1130,6 +1057,7 @@ restart: args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -1145,8 +1073,6 @@ restart: state = xfs_da_state_alloc(); state->args = args; state->mp = mp; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; state->inleaf = 0; error = xfs_da3_node_lookup_int(state, &retval); if (error) @@ -1237,8 +1163,6 @@ xfs_attr_node_removename(xfs_da_args_t *args) state = xfs_da_state_alloc(); state->args = args; state->mp = dp->i_mount; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name exists, and get back a pointer to it. @@ -1500,8 +1424,6 @@ xfs_attr_node_get(xfs_da_args_t *args) state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name exists, and get back a pointer to it. diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index bb24b07cbed..09480c57f06 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -18,22 +18,20 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" +#include "xfs_inode.h" #include "xfs_alloc.h" -#include "xfs_btree.h" #include "xfs_attr_remote.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_attr.h" @@ -41,7 +39,8 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trace.h" -#include "xfs_trans_priv.h" +#include "xfs_dinode.h" +#include "xfs_dir2.h" /* * Look at all the extents for this logical region, @@ -232,13 +231,13 @@ xfs_attr3_node_inactive( } node = bp->b_addr; - xfs_da3_node_hdr_from_disk(&ichdr, node); + dp->d_ops->node_hdr_from_disk(&ichdr, node); parent_blkno = bp->b_bn; if (!ichdr.count) { xfs_trans_brelse(*trans, bp); return 0; } - btree = xfs_da3_node_tree_p(node); + btree = dp->d_ops->node_tree_p(node); child_fsb = be32_to_cpu(btree[0].before); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 86db20a9cc0..28712d29e43 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -18,32 +18,31 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_alloc.h" -#include "xfs_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_attr_remote.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" +#include "xfs_dinode.h" +#include "xfs_dir2.h" /* @@ -81,11 +80,12 @@ STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, /* * Utility routines. */ -STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf, +STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args, + struct xfs_attr_leafblock *src_leaf, struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start, struct xfs_attr_leafblock *dst_leaf, struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start, - int move_count, struct xfs_mount *mp); + int move_count); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); void @@ -214,8 +214,8 @@ xfs_attr3_leaf_write_verify( struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; if (!xfs_attr3_leaf_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -225,7 +225,7 @@ xfs_attr3_leaf_write_verify( if (bip) hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF); } /* @@ -240,13 +240,14 @@ xfs_attr3_leaf_read_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_LEAF_CRC_OFF)) || - !xfs_attr3_leaf_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_attr3_leaf_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { @@ -711,6 +712,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) memset((char *)&nargs, 0, sizeof(nargs)); nargs.dp = dp; + nargs.geo = args->geo; nargs.firstblock = args->firstblock; nargs.flist = args->flist; nargs.total = args->total; @@ -805,18 +807,18 @@ xfs_attr3_leaf_to_shortform( trace_xfs_attr_leaf_to_sf(args); - tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); + tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); if (!tmpbuffer) return ENOMEM; - memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount)); + memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); leaf = (xfs_attr_leafblock_t *)tmpbuffer; xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); /* XXX (dgc): buffer is about to be marked stale - why zero it? */ - memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount)); + memset(bp->b_addr, 0, args->geo->blksize); /* * Clean out the prior contents of the attribute list. @@ -838,6 +840,7 @@ xfs_attr3_leaf_to_shortform( * Copy the attributes */ memset((char *)&nargs, 0, sizeof(nargs)); + nargs.geo = args->geo; nargs.dp = dp; nargs.firstblock = args->firstblock; nargs.flist = args->flist; @@ -904,12 +907,12 @@ xfs_attr3_leaf_to_node( /* copy leaf to new buffer, update identifiers */ xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(mp)); + memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; hdr3->blkno = cpu_to_be64(bp2->b_bn); } - xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(mp) - 1); + xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); /* * Set up the new root node. @@ -918,8 +921,8 @@ xfs_attr3_leaf_to_node( if (error) goto out; node = bp1->b_addr; - xfs_da3_node_hdr_from_disk(&icnodehdr, node); - btree = xfs_da3_node_tree_p(node); + dp->d_ops->node_hdr_from_disk(&icnodehdr, node); + btree = dp->d_ops->node_tree_p(node); leaf = bp2->b_addr; xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); @@ -929,8 +932,8 @@ xfs_attr3_leaf_to_node( btree[0].hashval = entries[icleafhdr.count - 1].hashval; btree[0].before = cpu_to_be32(blkno); icnodehdr.count = 1; - xfs_da3_node_hdr_to_disk(node, &icnodehdr); - xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(mp) - 1); + dp->d_ops->node_hdr_to_disk(node, &icnodehdr); + xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1); error = 0; out: return error; @@ -966,10 +969,10 @@ xfs_attr3_leaf_create( bp->b_ops = &xfs_attr3_leaf_buf_ops; xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF); leaf = bp->b_addr; - memset(leaf, 0, XFS_LBSIZE(mp)); + memset(leaf, 0, args->geo->blksize); memset(&ichdr, 0, sizeof(ichdr)); - ichdr.firstused = XFS_LBSIZE(mp); + ichdr.firstused = args->geo->blksize; if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_blkinfo *hdr3 = bp->b_addr; @@ -988,7 +991,7 @@ xfs_attr3_leaf_create( ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); - xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(mp) - 1); + xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); *bpp = bp; return 0; @@ -1074,8 +1077,7 @@ xfs_attr3_leaf_add( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); ASSERT(args->index >= 0 && args->index <= ichdr.count); - entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - args->trans->t_mountp->m_sb.sb_blocksize, NULL); + entsize = xfs_attr_leaf_newentsize(args, NULL); /* * Search through freemap for first-fit on new name length. @@ -1174,17 +1176,14 @@ xfs_attr3_leaf_add_work( * Allocate space for the new string (at the end of the run). */ mp = args->trans->t_mountp; - ASSERT(ichdr->freemap[mapindex].base < XFS_LBSIZE(mp)); + ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize); ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0); ASSERT(ichdr->freemap[mapindex].size >= - xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - mp->m_sb.sb_blocksize, NULL)); - ASSERT(ichdr->freemap[mapindex].size < XFS_LBSIZE(mp)); + xfs_attr_leaf_newentsize(args, NULL)); + ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize); ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0); - ichdr->freemap[mapindex].size -= - xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - mp->m_sb.sb_blocksize, &tmp); + ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp); entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + ichdr->freemap[mapindex].size); @@ -1229,6 +1228,7 @@ xfs_attr3_leaf_add_work( name_rmt->valueblk = 0; args->rmtblkno = 1; args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + args->rmtvaluelen = args->valuelen; } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -1268,14 +1268,13 @@ xfs_attr3_leaf_compact( struct xfs_attr_leafblock *leaf_dst; struct xfs_attr3_icleaf_hdr ichdr_src; struct xfs_trans *trans = args->trans; - struct xfs_mount *mp = trans->t_mountp; char *tmpbuffer; trace_xfs_attr_leaf_compact(args); - tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); - memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); - memset(bp->b_addr, 0, XFS_LBSIZE(mp)); + tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); + memset(bp->b_addr, 0, args->geo->blksize); leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; leaf_dst = bp->b_addr; @@ -1288,7 +1287,7 @@ xfs_attr3_leaf_compact( /* Initialise the incore headers */ ichdr_src = *ichdr_dst; /* struct copy */ - ichdr_dst->firstused = XFS_LBSIZE(mp); + ichdr_dst->firstused = args->geo->blksize; ichdr_dst->usedbytes = 0; ichdr_dst->count = 0; ichdr_dst->holes = 0; @@ -1303,13 +1302,13 @@ xfs_attr3_leaf_compact( * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ - xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0, - ichdr_src.count, mp); + xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0, + leaf_dst, ichdr_dst, 0, ichdr_src.count); /* * this logs the entire buffer, but the caller must write the header * back to the buffer when it is finished modifying it. */ - xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); + xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); kmem_free(tmpbuffer); } @@ -1460,8 +1459,8 @@ xfs_attr3_leaf_rebalance( /* * Move high entries from leaf1 to low end of leaf2. */ - xfs_attr3_leaf_moveents(leaf1, &ichdr1, ichdr1.count - count, - leaf2, &ichdr2, 0, count, state->mp); + xfs_attr3_leaf_moveents(args, leaf1, &ichdr1, + ichdr1.count - count, leaf2, &ichdr2, 0, count); } else if (count > ichdr1.count) { /* @@ -1489,14 +1488,14 @@ xfs_attr3_leaf_rebalance( /* * Move low entries from leaf2 to high end of leaf1. */ - xfs_attr3_leaf_moveents(leaf2, &ichdr2, 0, leaf1, &ichdr1, - ichdr1.count, count, state->mp); + xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1, + ichdr1.count, count); } xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); - xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); /* * Copy out last hashval in each block for B-tree code. @@ -1591,11 +1590,9 @@ xfs_attr3_leaf_figure_balance( max = ichdr1->count + ichdr2->count; half = (max + 1) * sizeof(*entry); half += ichdr1->usedbytes + ichdr2->usedbytes + - xfs_attr_leaf_newentsize(state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + xfs_attr_leaf_newentsize(state->args, NULL); half /= 2; - lastdelta = state->blocksize; + lastdelta = state->args->geo->blksize; entry = xfs_attr3_leaf_entryp(leaf1); for (count = index = 0; count < max; entry++, index++, count++) { @@ -1605,10 +1602,7 @@ xfs_attr3_leaf_figure_balance( */ if (count == blk1->index) { tmp = totallen + sizeof(*entry) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + xfs_attr_leaf_newentsize(state->args, NULL); if (XFS_ATTR_ABS(half - tmp) > lastdelta) break; lastdelta = XFS_ATTR_ABS(half - tmp); @@ -1644,10 +1638,7 @@ xfs_attr3_leaf_figure_balance( totallen -= count * sizeof(*entry); if (foundit) { totallen -= sizeof(*entry) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + xfs_attr_leaf_newentsize(state->args, NULL); } *countarg = count; @@ -1699,7 +1690,7 @@ xfs_attr3_leaf_toosmall( bytes = xfs_attr3_leaf_hdr_size(leaf) + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + ichdr.usedbytes; - if (bytes > (state->blocksize >> 1)) { + if (bytes > (state->args->geo->blksize >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); } @@ -1753,7 +1744,8 @@ xfs_attr3_leaf_toosmall( xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); - bytes = state->blocksize - (state->blocksize >> 2) - + bytes = state->args->geo->blksize - + (state->args->geo->blksize >> 2) - ichdr.usedbytes - ichdr2.usedbytes - ((ichdr.count + ichdr2.count) * sizeof(xfs_attr_leaf_entry_t)) - @@ -1804,7 +1796,6 @@ xfs_attr3_leaf_remove( struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entry; - struct xfs_mount *mp = args->trans->t_mountp; int before; int after; int smallest; @@ -1818,7 +1809,7 @@ xfs_attr3_leaf_remove( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); - ASSERT(ichdr.count > 0 && ichdr.count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); ASSERT(args->index >= 0 && args->index < ichdr.count); ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) + xfs_attr3_leaf_hdr_size(leaf)); @@ -1826,7 +1817,7 @@ xfs_attr3_leaf_remove( entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); - ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); /* * Scan through free region table: @@ -1841,8 +1832,8 @@ xfs_attr3_leaf_remove( smallest = XFS_ATTR_LEAF_MAPSIZE - 1; entsize = xfs_attr_leaf_entsize(leaf, args->index); for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { - ASSERT(ichdr.freemap[i].base < XFS_LBSIZE(mp)); - ASSERT(ichdr.freemap[i].size < XFS_LBSIZE(mp)); + ASSERT(ichdr.freemap[i].base < args->geo->blksize); + ASSERT(ichdr.freemap[i].size < args->geo->blksize); if (ichdr.freemap[i].base == tablesize) { ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t); ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t); @@ -1919,11 +1910,11 @@ xfs_attr3_leaf_remove( * removing the name. */ if (smallest) { - tmp = XFS_LBSIZE(mp); + tmp = args->geo->blksize; entry = xfs_attr3_leaf_entryp(leaf); for (i = ichdr.count - 1; i >= 0; entry++, i--) { ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); - ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); if (be16_to_cpu(entry->nameidx) < tmp) tmp = be16_to_cpu(entry->nameidx); @@ -1946,7 +1937,7 @@ xfs_attr3_leaf_remove( tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) + ichdr.count * sizeof(xfs_attr_leaf_entry_t); - return tmp < mp->m_attr_magicpct; /* leaf is < 37% full */ + return tmp < args->geo->magicpct; /* leaf is < 37% full */ } /* @@ -1963,7 +1954,6 @@ xfs_attr3_leaf_unbalance( struct xfs_attr3_icleaf_hdr drophdr; struct xfs_attr3_icleaf_hdr savehdr; struct xfs_attr_leaf_entry *entry; - struct xfs_mount *mp = state->mp; trace_xfs_attr_leaf_unbalance(state->args); @@ -1990,13 +1980,15 @@ xfs_attr3_leaf_unbalance( */ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { - xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, save_leaf, &savehdr, 0, - drophdr.count, mp); + drophdr.count); } else { - xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, save_leaf, &savehdr, - savehdr.count, drophdr.count, mp); + savehdr.count, drophdr.count); } } else { /* @@ -2006,7 +1998,7 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP); + tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); /* * Copy the header into the temp leaf so that all the stuff @@ -2019,35 +2011,39 @@ xfs_attr3_leaf_unbalance( tmphdr.magic = savehdr.magic; tmphdr.forw = savehdr.forw; tmphdr.back = savehdr.back; - tmphdr.firstused = state->blocksize; + tmphdr.firstused = state->args->geo->blksize; /* write the header to the temp buffer to initialise it */ xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { - xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, tmp_leaf, &tmphdr, 0, - drophdr.count, mp); - xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0, + drophdr.count); + xfs_attr3_leaf_moveents(state->args, + save_leaf, &savehdr, 0, tmp_leaf, &tmphdr, tmphdr.count, - savehdr.count, mp); + savehdr.count); } else { - xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0, + xfs_attr3_leaf_moveents(state->args, + save_leaf, &savehdr, 0, tmp_leaf, &tmphdr, 0, - savehdr.count, mp); - xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + savehdr.count); + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, tmp_leaf, &tmphdr, tmphdr.count, - drophdr.count, mp); + drophdr.count); } - memcpy(save_leaf, tmp_leaf, state->blocksize); + memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); savehdr = tmphdr; /* struct copy */ kmem_free(tmp_leaf); } xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, - state->blocksize - 1); + state->args->geo->blksize - 1); /* * Copy out last hashval in each block for B-tree code. @@ -2093,7 +2089,7 @@ xfs_attr3_leaf_lookup_int( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); - ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8); + ASSERT(ichdr.count < args->geo->blksize / 8); /* * Binary search. (note: small blocks will skip this loop) @@ -2167,11 +2163,11 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - args->valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, - args->valuelen); + args->rmtvaluelen); return XFS_ERROR(EEXIST); } } @@ -2197,7 +2193,7 @@ xfs_attr3_leaf_getvalue( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); - ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8); + ASSERT(ichdr.count < args->geo->blksize / 8); ASSERT(args->index < ichdr.count); entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; @@ -2220,19 +2216,19 @@ xfs_attr3_leaf_getvalue( name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, - valuelen); + args->rmtvaluelen); if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; return 0; } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; + if (args->valuelen < args->rmtvaluelen) { + args->valuelen = args->rmtvaluelen; return XFS_ERROR(ERANGE); } - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; } return 0; } @@ -2248,14 +2244,14 @@ xfs_attr3_leaf_getvalue( /*ARGSUSED*/ STATIC void xfs_attr3_leaf_moveents( + struct xfs_da_args *args, struct xfs_attr_leafblock *leaf_s, struct xfs_attr3_icleaf_hdr *ichdr_s, int start_s, struct xfs_attr_leafblock *leaf_d, struct xfs_attr3_icleaf_hdr *ichdr_d, int start_d, - int count, - struct xfs_mount *mp) + int count) { struct xfs_attr_leaf_entry *entry_s; struct xfs_attr_leaf_entry *entry_d; @@ -2275,10 +2271,10 @@ xfs_attr3_leaf_moveents( ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC || ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC); ASSERT(ichdr_s->magic == ichdr_d->magic); - ASSERT(ichdr_s->count > 0 && ichdr_s->count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8); ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s)) + xfs_attr3_leaf_hdr_size(leaf_s)); - ASSERT(ichdr_d->count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr_d->count < args->geo->blksize / 8); ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d)) + xfs_attr3_leaf_hdr_size(leaf_d)); @@ -2330,11 +2326,11 @@ xfs_attr3_leaf_moveents( entry_d->nameidx = cpu_to_be16(ichdr_d->firstused); entry_d->flags = entry_s->flags; ASSERT(be16_to_cpu(entry_d->nameidx) + tmp - <= XFS_LBSIZE(mp)); + <= args->geo->blksize); memmove(xfs_attr3_leaf_name(leaf_d, desti), xfs_attr3_leaf_name(leaf_s, start_s + i), tmp); ASSERT(be16_to_cpu(entry_s->nameidx) + tmp - <= XFS_LBSIZE(mp)); + <= args->geo->blksize); memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); ichdr_s->usedbytes -= tmp; ichdr_d->usedbytes += tmp; @@ -2355,7 +2351,7 @@ xfs_attr3_leaf_moveents( tmp = count * sizeof(xfs_attr_leaf_entry_t); entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; ASSERT(((char *)entry_s + tmp) <= - ((char *)leaf_s + XFS_LBSIZE(mp))); + ((char *)leaf_s + args->geo->blksize)); memset(entry_s, 0, tmp); } else { /* @@ -2370,7 +2366,7 @@ xfs_attr3_leaf_moveents( tmp = count * sizeof(xfs_attr_leaf_entry_t); entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count]; ASSERT(((char *)entry_s + tmp) <= - ((char *)leaf_s + XFS_LBSIZE(mp))); + ((char *)leaf_s + args->geo->blksize)); memset(entry_s, 0, tmp); } @@ -2438,22 +2434,21 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) * a "local" or a "remote" attribute. */ int -xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) +xfs_attr_leaf_newentsize( + struct xfs_da_args *args, + int *local) { - int size; + int size; - size = xfs_attr_leaf_entsize_local(namelen, valuelen); - if (size < xfs_attr_leaf_entsize_local_max(blocksize)) { - if (local) { + size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen); + if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) { + if (local) *local = 1; - } - } else { - size = xfs_attr_leaf_entsize_remote(namelen); - if (local) { - *local = 0; - } + return size; } - return size; + if (local) + *local = 0; + return xfs_attr_leaf_entsize_remote(args->namelen); } @@ -2519,7 +2514,7 @@ xfs_attr3_leaf_clearflag( ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } @@ -2677,7 +2672,7 @@ xfs_attr3_leaf_flipflags( ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index c1022138c7e..e2929da7c3b 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -19,16 +19,6 @@ #ifndef __XFS_ATTR_LEAF_H__ #define __XFS_ATTR_LEAF_H__ -/* - * Attribute storage layout, internal structure, access macros, etc. - * - * Attribute lists are structured around Btrees where all the data - * elements are in the leaf nodes. Attribute names are hashed into an int, - * then that int is used as the index into the Btree. Since the hashval - * of an attribute name may not be unique, we may have duplicate keys. The - * internal links in the Btree are logical block offsets into the file. - */ - struct attrlist; struct attrlist_cursor_kern; struct xfs_attr_list_context; @@ -38,226 +28,6 @@ struct xfs_da_state_blk; struct xfs_inode; struct xfs_trans; -/*======================================================================== - * Attribute structure when equal to XFS_LBSIZE(mp) bytes. - *========================================================================*/ - -/* - * This is the structure of the leaf nodes in the Btree. - * - * Struct leaf_entry's are packed from the top. Name/values grow from the - * bottom but are not packed. The freemap contains run-length-encoded entries - * for the free bytes after the leaf_entry's, but only the N largest such, - * smaller runs are dropped. When the freemap doesn't show enough space - * for an allocation, we compact the name/value area and try again. If we - * still don't have enough space, then we have to split the block. The - * name/value structs (both local and remote versions) must be 32bit aligned. - * - * Since we have duplicate hash keys, for each key that matches, compare - * the actual name string. The root and intermediate node search always - * takes the first-in-the-block key match found, so we should only have - * to work "forw"ard. If none matches, continue with the "forw"ard leaf - * nodes until the hash key changes or the attribute name is found. - * - * We store the fact that an attribute is a ROOT/USER/SECURE attribute in - * the leaf_entry. The namespaces are independent only because we also look - * at the namespace bit when we are looking for a matching attribute name. - * - * We also store an "incomplete" bit in the leaf_entry. It shows that an - * attribute is in the middle of being created and should not be shown to - * the user if we crash during the time that the bit is set. We clear the - * bit when we have finished setting up the attribute. We do this because - * we cannot create some large attributes inside a single transaction, and we - * need some indication that we weren't finished if we crash in the middle. - */ -#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ - -typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ - __be16 base; /* base of free region */ - __be16 size; /* length of free region */ -} xfs_attr_leaf_map_t; - -typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. */ - __be16 count; /* count of active leaf_entry's */ - __be16 usedbytes; /* num bytes of names/values stored */ - __be16 firstused; /* first used byte in name area */ - __u8 holes; /* != 0 if blk needs compaction */ - __u8 pad1; - xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE]; - /* N largest free regions */ -} xfs_attr_leaf_hdr_t; - -typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ - __be32 hashval; /* hash value of name */ - __be16 nameidx; /* index into buffer of name/value */ - __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ - __u8 pad2; /* unused pad byte */ -} xfs_attr_leaf_entry_t; - -typedef struct xfs_attr_leaf_name_local { - __be16 valuelen; /* number of bytes in value */ - __u8 namelen; /* length of name bytes */ - __u8 nameval[1]; /* name/value bytes */ -} xfs_attr_leaf_name_local_t; - -typedef struct xfs_attr_leaf_name_remote { - __be32 valueblk; /* block number of value bytes */ - __be32 valuelen; /* number of bytes in value */ - __u8 namelen; /* length of name bytes */ - __u8 name[1]; /* name bytes */ -} xfs_attr_leaf_name_remote_t; - -typedef struct xfs_attr_leafblock { - xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ - xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ - xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ - xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ -} xfs_attr_leafblock_t; - -/* - * CRC enabled leaf structures. Called "version 3" structures to match the - * version number of the directory and dablk structures for this feature, and - * attr2 is already taken by the variable inode attribute fork size feature. - */ -struct xfs_attr3_leaf_hdr { - struct xfs_da3_blkinfo info; - __be16 count; - __be16 usedbytes; - __be16 firstused; - __u8 holes; - __u8 pad1; - struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; - __be32 pad2; /* 64 bit alignment */ -}; - -#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) - -struct xfs_attr3_leafblock { - struct xfs_attr3_leaf_hdr hdr; - struct xfs_attr_leaf_entry entries[1]; - - /* - * The rest of the block contains the following structures after the - * leaf entries, growing from the bottom up. The variables are never - * referenced, the locations accessed purely from helper functions. - * - * struct xfs_attr_leaf_name_local - * struct xfs_attr_leaf_name_remote - */ -}; - -/* - * incore, neutral version of the attribute leaf header - */ -struct xfs_attr3_icleaf_hdr { - __uint32_t forw; - __uint32_t back; - __uint16_t magic; - __uint16_t count; - __uint16_t usedbytes; - __uint16_t firstused; - __u8 holes; - struct { - __uint16_t base; - __uint16_t size; - } freemap[XFS_ATTR_LEAF_MAPSIZE]; -}; - -/* - * Flags used in the leaf_entry[i].flags field. - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified - * on the system call, they are "or"ed together for various operations. - */ -#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ -#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ -#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ -#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ -#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) -#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) -#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) -#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) - -/* - * Conversion macros for converting namespace bits from argument flags - * to ondisk flags. - */ -#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) -#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) -#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) -#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) -#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ - ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) -#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ - ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) - -/* - * Alignment for namelist and valuelist entries (since they are mixed - * there can be only one alignment value) - */ -#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) - -static inline int -xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp) -{ - if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) - return sizeof(struct xfs_attr3_leaf_hdr); - return sizeof(struct xfs_attr_leaf_hdr); -} - -static inline struct xfs_attr_leaf_entry * -xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp) -{ - if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) - return &((struct xfs_attr3_leafblock *)leafp)->entries[0]; - return &leafp->entries[0]; -} - -/* - * Cast typed pointers for "local" and "remote" name/value structs. - */ -static inline char * -xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx) -{ - struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp); - - return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)]; -} - -static inline xfs_attr_leaf_name_remote_t * -xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) -{ - return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx); -} - -static inline xfs_attr_leaf_name_local_t * -xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) -{ - return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx); -} - -/* - * Calculate total bytes used (including trailing pad for alignment) for - * a "local" name/value structure, a "remote" name/value structure, and - * a pointer which might be either. - */ -static inline int xfs_attr_leaf_entsize_remote(int nlen) -{ - return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); -} - -static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) -{ - return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); -} - -static inline int xfs_attr_leaf_entsize_local_max(int bsize) -{ - return (((bsize) >> 1) + ((bsize) >> 2)); -} - /* * Used to keep a list of "remote value" extents when unlinking an inode. */ @@ -326,8 +96,7 @@ int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count); int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); -int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, - int *local); +int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); @@ -336,6 +105,4 @@ void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from); -extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; - #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index cbc80d48517..90e2eeb2120 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -18,31 +18,29 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_alloc.h" -#include "xfs_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_attr_remote.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_attr.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" +#include "xfs_dinode.h" +#include "xfs_dir2.h" STATIC int xfs_attr_shortform_compare(const void *a, const void *b) @@ -229,6 +227,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) struct xfs_da_node_entry *btree; int error, i; struct xfs_buf *bp; + struct xfs_inode *dp = context->dp; trace_xfs_attr_node_list(context); @@ -242,7 +241,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1, + error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if ((error != 0) && (error != EFSCORRUPTED)) return(error); @@ -292,7 +291,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) for (;;) { __uint16_t magic; - error = xfs_da3_node_read(NULL, context->dp, + error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if (error) @@ -312,8 +311,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) return XFS_ERROR(EFSCORRUPTED); } - xfs_da3_node_hdr_from_disk(&nodehdr, node); - btree = xfs_da3_node_tree_p(node); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); for (i = 0; i < nodehdr.count; btree++, i++) { if (cursor->hashval <= be32_to_cpu(btree->hashval)) { @@ -349,8 +348,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) break; cursor->blkno = leafhdr.forw; xfs_trans_brelse(NULL, bp); - error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1, - &bp); + error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp); if (error) return error; } @@ -446,9 +444,11 @@ xfs_attr3_leaf_list_int( xfs_da_args_t args; memset((char *)&args, 0, sizeof(args)); + args.geo = context->dp->i_mount->m_attr_geo; args.dp = context->dp; args.whichfork = XFS_ATTR_FORK; args.valuelen = valuelen; + args.rmtvaluelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkcnt = xfs_attr3_rmt_blocks( @@ -509,17 +509,17 @@ xfs_attr_list_int( { int error; xfs_inode_t *dp = context->dp; + uint lock_mode; XFS_STATS_INC(xs_attr_list); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return EIO; - xfs_ilock(dp, XFS_ILOCK_SHARED); - /* * Decide on what work routines to call based on the inode size. */ + lock_mode = xfs_ilock_attr_map_shared(dp); if (!xfs_inode_hasattr(dp)) { error = 0; } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { @@ -529,9 +529,7 @@ xfs_attr_list_int( } else { error = xfs_attr_node_list(context); } - - xfs_iunlock(dp, XFS_ILOCK_SHARED); - + xfs_iunlock(dp, lock_mode); return error; } diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index 712a502de61..b5adfecbb8e 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -18,20 +18,19 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_error.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_alloc.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -42,6 +41,7 @@ #include "xfs_trace.h" #include "xfs_cksum.h" #include "xfs_buf_item.h" +#include "xfs_error.h" #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ @@ -68,7 +68,6 @@ xfs_attr3_rmt_blocks( */ static bool xfs_attr3_rmt_hdr_ok( - struct xfs_mount *mp, void *ptr, xfs_ino_t ino, uint32_t offset, @@ -110,7 +109,7 @@ xfs_attr3_rmt_verify( if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + - be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) + be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX) return false; if (rmt->rm_owner == 0) return false; @@ -125,8 +124,8 @@ xfs_attr3_rmt_read_verify( struct xfs_mount *mp = bp->b_target->bt_mount; char *ptr; int len; - bool corrupt = false; xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) @@ -135,27 +134,25 @@ xfs_attr3_rmt_read_verify( ptr = bp->b_addr; bno = bp->b_bn; len = BBTOB(bp->b_length); - ASSERT(len >= XFS_LBSIZE(mp)); + ASSERT(len >= blksize); while (len > 0) { - if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp), - XFS_ATTR3_RMT_CRC_OFF)) { - corrupt = true; + if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { + xfs_buf_ioerror(bp, EFSBADCRC); break; } - if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { - corrupt = true; + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); break; } - len -= XFS_LBSIZE(mp); - ptr += XFS_LBSIZE(mp); - bno += mp->m_bsize; + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); } - if (corrupt) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } else + if (bp->b_error) + xfs_verifier_error(bp); + else ASSERT(len == 0); } @@ -168,6 +165,7 @@ xfs_attr3_rmt_write_verify( char *ptr; int len; xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) @@ -176,13 +174,12 @@ xfs_attr3_rmt_write_verify( ptr = bp->b_addr; bno = bp->b_bn; len = BBTOB(bp->b_length); - ASSERT(len >= XFS_LBSIZE(mp)); + ASSERT(len >= blksize); while (len > 0) { - if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } if (bip) { @@ -191,11 +188,11 @@ xfs_attr3_rmt_write_verify( rmt = (struct xfs_attr3_rmt_hdr *)ptr; rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); } - xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF); + xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); - len -= XFS_LBSIZE(mp); - ptr += XFS_LBSIZE(mp); - bno += mp->m_bsize; + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); } ASSERT(len == 0); } @@ -244,17 +241,18 @@ xfs_attr_rmtval_copyout( char *src = bp->b_addr; xfs_daddr_t bno = bp->b_bn; int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; - ASSERT(len >= XFS_LBSIZE(mp)); + ASSERT(len >= blksize); while (len > 0 && *valuelen > 0) { int hdr_size = 0; - int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); byte_cnt = min(*valuelen, byte_cnt); if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, + if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset, byte_cnt, bno)) { xfs_alert(mp, "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", @@ -267,9 +265,9 @@ xfs_attr_rmtval_copyout( memcpy(*dst, src + hdr_size, byte_cnt); /* roll buffer forwards */ - len -= XFS_LBSIZE(mp); - src += XFS_LBSIZE(mp); - bno += mp->m_bsize; + len -= blksize; + src += blksize; + bno += BTOBB(blksize); /* roll attribute data forwards */ *valuelen -= byte_cnt; @@ -291,12 +289,13 @@ xfs_attr_rmtval_copyin( char *dst = bp->b_addr; xfs_daddr_t bno = bp->b_bn; int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; - ASSERT(len >= XFS_LBSIZE(mp)); + ASSERT(len >= blksize); while (len > 0 && *valuelen > 0) { int hdr_size; - int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); byte_cnt = min(*valuelen, byte_cnt); hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, @@ -308,17 +307,17 @@ xfs_attr_rmtval_copyin( * If this is the last block, zero the remainder of it. * Check that we are actually the last block, too. */ - if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) { + if (byte_cnt + hdr_size < blksize) { ASSERT(*valuelen - byte_cnt == 0); - ASSERT(len == XFS_LBSIZE(mp)); + ASSERT(len == blksize); memset(dst + hdr_size + byte_cnt, 0, - XFS_LBSIZE(mp) - hdr_size - byte_cnt); + blksize - hdr_size - byte_cnt); } /* roll buffer forwards */ - len -= XFS_LBSIZE(mp); - dst += XFS_LBSIZE(mp); - bno += mp->m_bsize; + len -= blksize; + dst += blksize; + bno += BTOBB(blksize); /* roll attribute data forwards */ *valuelen -= byte_cnt; @@ -340,7 +339,7 @@ xfs_attr_rmtval_get( struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; __uint8_t *dst = args->value; - int valuelen = args->valuelen; + int valuelen; int nmap; int error; int blkcnt = args->rmtblkcnt; @@ -350,7 +349,9 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->rmtvaluelen == args->valuelen); + valuelen = args->rmtvaluelen; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, @@ -418,7 +419,7 @@ xfs_attr_rmtval_set( * attributes have headers, we can't just do a straight byte to FSB * conversion and have to take the header space into account. */ - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) @@ -483,7 +484,7 @@ xfs_attr_rmtval_set( */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; - valuelen = args->valuelen; + valuelen = args->rmtvaluelen; while (valuelen > 0) { struct xfs_buf *bp; xfs_daddr_t dblkno; diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h index 92a8fd7977c..5a9acfa156d 100644 --- a/fs/xfs/xfs_attr_remote.h +++ b/fs/xfs/xfs_attr_remote.h @@ -18,35 +18,6 @@ #ifndef __XFS_ATTR_REMOTE_H__ #define __XFS_ATTR_REMOTE_H__ -#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ - -/* - * There is one of these headers per filesystem block in a remote attribute. - * This is done to ensure there is a 1:1 mapping between the attribute value - * length and the number of blocks needed to store the attribute. This makes the - * verification of a buffer a little more complex, but greatly simplifies the - * allocation, reading and writing of these attributes as we don't have to guess - * the number of blocks needed to store the attribute data. - */ -struct xfs_attr3_rmt_hdr { - __be32 rm_magic; - __be32 rm_offset; - __be32 rm_bytes; - __be32 rm_crc; - uuid_t rm_uuid; - __be64 rm_owner; - __be64 rm_blkno; - __be64 rm_lsn; -}; - -#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) - -#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ - sizeof(struct xfs_attr3_rmt_hdr) : 0)) - -extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; - int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c index 48228848f5a..0e8885a5964 100644 --- a/fs/xfs/xfs_bit.c +++ b/fs/xfs/xfs_bit.c @@ -16,10 +16,8 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_log_format.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_buf_item.h" /* * XFS bit manipulation routines, used in non-realtime code. diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h index f1e3c907044..e1649c0d3e0 100644 --- a/fs/xfs/xfs_bit.h +++ b/fs/xfs/xfs_bit.h @@ -66,8 +66,11 @@ static inline int xfs_lowbit64(__uint64_t v) n = ffs(w); } else { /* upper bits */ w = (__uint32_t)(v >> 32); - if (w && (n = ffs(w))) - n += 32; + if (w) { + n = ffs(w); + if (n) + n += 32; + } } return n - 1; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index f47e65c30be..75c3fe5f3d9 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -17,39 +17,37 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_mount.h" -#include "xfs_itable.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_extfree_item.h" #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" #include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_attr_leaf.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_buf_item.h" -#include "xfs_filestream.h" #include "xfs_trace.h" #include "xfs_symlink.h" +#include "xfs_attr_leaf.h" +#include "xfs_dinode.h" +#include "xfs_filestream.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -96,7 +94,7 @@ xfs_bmap_compute_maxlevels( maxleafents = MAXAEXTNUM; sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); } - maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); + maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; minnoderecs = mp->m_bmap_dmnr[1]; maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; @@ -235,7 +233,6 @@ xfs_default_attroffset( */ STATIC void xfs_bmap_forkoff_reset( - xfs_mount_t *mp, xfs_inode_t *ip, int whichfork) { @@ -907,7 +904,7 @@ xfs_bmap_local_to_extents_empty( ASSERT(ifp->if_bytes == 0); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); - xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); + xfs_bmap_forkoff_reset(ip, whichfork); ifp->if_flags &= ~XFS_IFINLINE; ifp->if_flags |= XFS_IFEXTENTS; XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); @@ -1101,10 +1098,11 @@ xfs_bmap_add_attrfork_local( if (S_ISDIR(ip->i_d.di_mode)) { memset(&dargs, 0, sizeof(dargs)); + dargs.geo = ip->i_mount->m_dir_geo; dargs.dp = ip; dargs.firstblock = firstblock; dargs.flist = flist; - dargs.total = ip->i_mount->m_dirblkfsbs; + dargs.total = dargs.geo->fsbcount; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; return xfs_dir2_sf_to_block(&dargs); @@ -1139,6 +1137,7 @@ xfs_bmap_add_attrfork( int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ + int cancel_flags = 0; ASSERT(XFS_IFORK_Q(ip) == 0); @@ -1149,19 +1148,20 @@ xfs_bmap_add_attrfork( if (rsvd) tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); - if (error) - goto error0; + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); - return error; - } + if (error) + goto trans_cancel; + cancel_flags |= XFS_TRANS_ABORT; if (XFS_IFORK_Q(ip)) - goto error1; + goto trans_cancel; if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { /* * For inodes coming from pre-6.2 filesystems. @@ -1171,7 +1171,7 @@ xfs_bmap_add_attrfork( } ASSERT(ip->i_d.di_anextents == 0); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); switch (ip->i_d.di_format) { @@ -1193,7 +1193,7 @@ xfs_bmap_add_attrfork( default: ASSERT(0); error = XFS_ERROR(EINVAL); - goto error1; + goto trans_cancel; } ASSERT(ip->i_afp == NULL); @@ -1221,7 +1221,7 @@ xfs_bmap_add_attrfork( if (logflags) xfs_trans_log_inode(tp, ip, logflags); if (error) - goto error2; + goto bmap_cancel; if (!xfs_sb_version_hasattr(&mp->m_sb) || (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { __int64_t sbfields = 0; @@ -1244,14 +1244,16 @@ xfs_bmap_add_attrfork( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) - goto error2; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); -error2: + goto bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +bmap_cancel: xfs_bmap_cancel(&flist); -error1: +trans_cancel: + xfs_trans_cancel(tp, cancel_flags); xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); return error; } @@ -1482,7 +1484,7 @@ xfs_bmap_search_extents( xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, "Access to block zero in inode %llu " "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x lastx: %x\n", + "blkcnt: %llx extent-state: %x lastx: %x", (unsigned long long)ip->i_ino, (unsigned long long)gotp->br_startblock, (unsigned long long)gotp->br_startoff, @@ -1633,7 +1635,7 @@ xfs_bmap_last_extent( * blocks at the end of the file which do not start at the previous data block, * we will try to align the new blocks at stripe unit boundaries. * - * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be + * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be * at, or past the EOF. */ STATIC int @@ -1648,9 +1650,14 @@ xfs_bmap_isaeof( bma->aeof = 0; error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, &is_empty); - if (error || is_empty) + if (error) return error; + if (is_empty) { + bma->aeof = 1; + return 0; + } + /* * Check if we are allocation or past the last extent, or at least into * the last delayed allocated extent. @@ -1668,7 +1675,6 @@ xfs_bmap_isaeof( */ int xfs_bmap_last_offset( - struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *last_block, int whichfork) @@ -3510,6 +3516,67 @@ xfs_bmap_adjacent( #undef ISVALID } +static int +xfs_bmap_longest_free_extent( + struct xfs_trans *tp, + xfs_agnumber_t ag, + xfs_extlen_t *blen, + int *notinit) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + xfs_extlen_t longest; + int error = 0; + + pag = xfs_perag_get(mp, ag); + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); + if (error) + goto out; + + if (!pag->pagf_init) { + *notinit = 1; + goto out; + } + } + + longest = xfs_alloc_longest_free_extent(mp, pag); + if (*blen < longest) + *blen = longest; + +out: + xfs_perag_put(pag); + return error; +} + +static void +xfs_bmap_select_minlen( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen, + int notinit) +{ + if (notinit || *blen < ap->minlen) { + /* + * Since we did a BUF_TRYLOCK above, it is possible that + * there is space for this request. + */ + args->minlen = ap->minlen; + } else if (*blen < args->maxlen) { + /* + * If the best seen length is less than the request length, + * use the best as the minimum. + */ + args->minlen = *blen; + } else { + /* + * Otherwise we've seen an extent as big as maxlen, use that + * as the minimum. + */ + args->minlen = args->maxlen; + } +} + STATIC int xfs_bmap_btalloc_nullfb( struct xfs_bmalloca *ap, @@ -3517,111 +3584,74 @@ xfs_bmap_btalloc_nullfb( xfs_extlen_t *blen) { struct xfs_mount *mp = ap->ip->i_mount; - struct xfs_perag *pag; xfs_agnumber_t ag, startag; int notinit = 0; int error; - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) - args->type = XFS_ALLOCTYPE_NEAR_BNO; - else - args->type = XFS_ALLOCTYPE_START_BNO; + args->type = XFS_ALLOCTYPE_START_BNO; args->total = ap->total; - /* - * Search for an allocation group with a single extent large enough - * for the request. If one isn't found, then adjust the minimum - * allocation size to the largest space found. - */ startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); if (startag == NULLAGNUMBER) startag = ag = 0; - pag = xfs_perag_get(mp, ag); while (*blen < args->maxlen) { - if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, args->tp, ag, - XFS_ALLOC_FLAG_TRYLOCK); - if (error) { - xfs_perag_put(pag); - return error; - } - } - - /* - * See xfs_alloc_fix_freelist... - */ - if (pag->pagf_init) { - xfs_extlen_t longest; - longest = xfs_alloc_longest_free_extent(mp, pag); - if (*blen < longest) - *blen = longest; - } else - notinit = 1; - - if (xfs_inode_is_filestream(ap->ip)) { - if (*blen >= args->maxlen) - break; - - if (ap->userdata) { - /* - * If startag is an invalid AG, we've - * come here once before and - * xfs_filestream_new_ag picked the - * best currently available. - * - * Don't continue looping, since we - * could loop forever. - */ - if (startag == NULLAGNUMBER) - break; - - error = xfs_filestream_new_ag(ap, &ag); - xfs_perag_put(pag); - if (error) - return error; + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; - /* loop again to set 'blen'*/ - startag = NULLAGNUMBER; - pag = xfs_perag_get(mp, ag); - continue; - } - } if (++ag == mp->m_sb.sb_agcount) ag = 0; if (ag == startag) break; - xfs_perag_put(pag); - pag = xfs_perag_get(mp, ag); } - xfs_perag_put(pag); - /* - * Since the above loop did a BUF_TRYLOCK, it is - * possible that there is space for this request. - */ - if (notinit || *blen < ap->minlen) - args->minlen = ap->minlen; - /* - * If the best seen length is less than the request - * length, use the best as the minimum. - */ - else if (*blen < args->maxlen) - args->minlen = *blen; - /* - * Otherwise we've seen an extent as big as maxlen, - * use that as the minimum. - */ - else - args->minlen = args->maxlen; + xfs_bmap_select_minlen(ap, args, blen, notinit); + return 0; +} + +STATIC int +xfs_bmap_btalloc_filestreams( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_agnumber_t ag; + int notinit = 0; + int error; + + args->type = XFS_ALLOCTYPE_NEAR_BNO; + args->total = ap->total; + + ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (ag == NULLAGNUMBER) + ag = 0; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, ¬init); + if (error) + return error; + + if (*blen < args->maxlen) { + error = xfs_filestream_new_ag(ap, &ag); + if (error) + return error; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; + + } + + xfs_bmap_select_minlen(ap, args, blen, notinit); /* - * set the failure fallback case to look in the selected - * AG as the stream may have moved. + * Set the failure fallback case to look in the selected AG as stream + * may have moved. */ - if (xfs_inode_is_filestream(ap->ip)) - ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); - + ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); return 0; } @@ -3641,10 +3671,19 @@ xfs_bmap_btalloc( int isaligned; int tryagain; int error; + int stripe_align; ASSERT(ap->length); mp = ap->ip->i_mount; + + /* stripe alignment for allocation is determined by mount parameters */ + stripe_align = 0; + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + stripe_align = mp->m_swidth; + else if (mp->m_dalign) + stripe_align = mp->m_dalign; + align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, @@ -3653,6 +3692,8 @@ xfs_bmap_btalloc( ASSERT(!error); ASSERT(ap->length); } + + nullfb = *ap->firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { @@ -3690,7 +3731,15 @@ xfs_bmap_btalloc( args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { - error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + /* + * Search for an allocation group with a single extent large + * enough for the request. If one isn't found, then adjust + * the minimum allocation size to the largest space found. + */ + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); + else + error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); if (error) return error; } else if (ap->flist->xbf_low) { @@ -3728,7 +3777,7 @@ xfs_bmap_btalloc( */ if (!ap->flist->xbf_low && ap->aeof) { if (!ap->offset) { - args.alignment = mp->m_dalign; + args.alignment = stripe_align; atype = args.type; isaligned = 1; /* @@ -3753,13 +3802,13 @@ xfs_bmap_btalloc( * of minlen+alignment+slop doesn't go up * between the calls. */ - if (blen > mp->m_dalign && blen <= args.maxlen) - nextminlen = blen - mp->m_dalign; + if (blen > stripe_align && blen <= args.maxlen) + nextminlen = blen - stripe_align; else nextminlen = args.minlen; - if (nextminlen + mp->m_dalign > args.minlen + 1) + if (nextminlen + stripe_align > args.minlen + 1) args.minalignslop = - nextminlen + mp->m_dalign - + nextminlen + stripe_align - args.minlen - 1; else args.minalignslop = 0; @@ -3781,7 +3830,7 @@ xfs_bmap_btalloc( */ args.type = atype; args.fsbno = ap->blkno; - args.alignment = mp->m_dalign; + args.alignment = stripe_align; args.minlen = nextminlen; args.minalignslop = 0; isaligned = 1; @@ -3995,6 +4044,7 @@ xfs_bmapi_read( ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| XFS_BMAPI_IGSTATE))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -4189,6 +4239,7 @@ xfs_bmapi_delay( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && @@ -4247,8 +4298,8 @@ xfs_bmapi_delay( } -int -__xfs_bmapi_allocate( +static int +xfs_bmapi_allocate( struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; @@ -4482,6 +4533,7 @@ xfs_bmapi_write( ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -4526,9 +4578,6 @@ xfs_bmapi_write( bma.flist = flist; bma.firstblock = firstblock; - if (flags & XFS_BMAPI_STACK_SWITCH) - bma.stack_switch = 1; - while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); @@ -5033,6 +5082,7 @@ xfs_bunmapi( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(len > 0); ASSERT(nexts >= 0); @@ -5356,3 +5406,201 @@ error0: } return error; } + +/* + * Shift extent records to the left to cover a hole. + * + * The maximum number of extents to be shifted in a single operation + * is @num_exts, and @current_ext keeps track of the current extent + * index we have shifted. @offset_shift_fsb is the length by which each + * extent is shifted. If there is no hole to shift the extents + * into, this will be considered invalid operation and we abort immediately. + */ +int +xfs_bmap_shift_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + int *done, + xfs_fileoff_t start_fsb, + xfs_fileoff_t offset_shift_fsb, + xfs_extnum_t *current_ext, + xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, + int num_exts) +{ + struct xfs_btree_cur *cur; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec left; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_extnum_t nexts = 0; + xfs_fileoff_t startoff; + int error = 0; + int i; + int whichfork = XFS_DATA_FORK; + int logflags; + xfs_filblks_t blockcount = 0; + int total_extents; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_shift_extents", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + ASSERT(current_ext != NULL); + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + /* + * If *current_ext is 0, we would need to lookup the extent + * from where we would start shifting and store it in gotp. + */ + if (!*current_ext) { + gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext); + /* + * gotp can be null in 2 cases: 1) if there are no extents + * or 2) start_fsb lies in a hole beyond which there are + * no extents. Either way, we are done. + */ + if (!gotp) { + *done = 1; + return 0; + } + } + + /* We are going to change core inode */ + logflags = XFS_ILOG_CORE; + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } else { + cur = NULL; + logflags |= XFS_ILOG_DEXT; + } + + /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot + * use the count of real extents here. Instead we have to calculate it + * from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + while (nexts++ < num_exts && *current_ext < total_extents) { + + gotp = xfs_iext_get_ext(ifp, *current_ext); + xfs_bmbt_get_all(gotp, &got); + startoff = got.br_startoff - offset_shift_fsb; + + /* + * Before shifting extent into hole, make sure that the hole + * is large enough to accomodate the shift. + */ + if (*current_ext) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, + *current_ext - 1), &left); + + if (startoff < left.br_startoff + left.br_blockcount) + error = XFS_ERROR(EINVAL); + } else if (offset_shift_fsb > got.br_startoff) { + /* + * When first extent is shifted, offset_shift_fsb + * should be less than the stating offset of + * the first extent. + */ + error = XFS_ERROR(EINVAL); + } + + if (error) + goto del_cursor; + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } + + /* Check if we can merge 2 adjacent extents */ + if (*current_ext && + left.br_startoff + left.br_blockcount == startoff && + left.br_startblock + left.br_blockcount == + got.br_startblock && + left.br_state == got.br_state && + left.br_blockcount + got.br_blockcount <= MAXEXTLEN) { + blockcount = left.br_blockcount + + got.br_blockcount; + xfs_iext_remove(ip, *current_ext, 1, 0); + if (cur) { + error = xfs_btree_delete(cur, &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + gotp = xfs_iext_get_ext(ifp, --*current_ext); + xfs_bmbt_get_all(gotp, &got); + + /* Make cursor point to the extent we will update */ + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } + + xfs_bmbt_set_blockcount(gotp, blockcount); + got.br_blockcount = blockcount; + } else { + /* We have to update the startoff */ + xfs_bmbt_set_startoff(gotp, startoff); + got.br_startoff = startoff; + } + + if (cur) { + error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state); + if (error) + goto del_cursor; + } + + (*current_ext)++; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + } + + /* Check if we are done */ + if (*current_ext == total_extents) + *done = 1; + +del_cursor: + if (cur) + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + + xfs_trans_log_inode(tp, ip, logflags); + return error; +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 33b41f35122..b879ca56a64 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,7 +77,6 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 -#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -86,8 +85,7 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" }, \ - { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } + { XFS_BMAPI_CONVERT, "CONVERT" } static inline int xfs_bmapi_aflag(int w) @@ -127,6 +125,16 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" } + +/* + * This macro is used to determine how many extents will be shifted + * in one write transaction. We could require two splits, + * an extent move on the first and an extent merge on the second, + * So it is proper that one extent is shifted inside write transaction + * at a time. + */ +#define XFS_BMAP_MAX_SHIFT_EXTENTS 1 + #ifdef DEBUG void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int whichfork, unsigned long caller_ip); @@ -146,8 +154,8 @@ int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *last_block, int whichfork); -int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, + int whichfork); int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); @@ -169,5 +177,10 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); +int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, + int *done, xfs_fileoff_t start_fsb, + xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext, + xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, + int num_exts); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index bb8de8e399c..948836c4fd9 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -17,27 +17,26 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_btree.h" -#include "xfs_itable.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_dinode.h" /* * Determine the extent state. @@ -85,7 +84,7 @@ xfs_bmdr_to_bmbt( rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; - dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + dmxr = xfs_bmdr_maxrecs(dblocklen, 0); fkp = XFS_BMDR_KEY_ADDR(dblock, 1); tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); @@ -444,7 +443,7 @@ xfs_bmbt_to_bmdr( ASSERT(rblock->bb_level != 0); dblock->bb_level = rblock->bb_level; dblock->bb_numrecs = rblock->bb_numrecs; - dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + dmxr = xfs_bmdr_maxrecs(dblocklen, 0); fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); tkp = XFS_BMDR_KEY_ADDR(dblock, 1); fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); @@ -520,7 +519,6 @@ xfs_bmbt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int length, int *stat) { xfs_alloc_arg_t args; /* block allocation args */ @@ -673,8 +671,7 @@ xfs_bmbt_get_dmaxrecs( { if (level != cur->bc_nlevels - 1) return cur->bc_mp->m_bmap_dmxr[level != 0]; - return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize, - level == 0); + return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); } STATIC void @@ -781,12 +778,14 @@ static void xfs_bmbt_read_verify( struct xfs_buf *bp) { - if (!(xfs_btree_lblock_verify_crc(bp) && - xfs_bmbt_verify(bp))) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); + if (!xfs_btree_lblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_bmbt_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); } } @@ -795,11 +794,9 @@ xfs_bmbt_write_verify( struct xfs_buf *bp) { if (!xfs_bmbt_verify(bp)) { - xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn); trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } xfs_btree_lblock_calc_crc(bp); @@ -915,7 +912,6 @@ xfs_bmbt_maxrecs( */ int xfs_bmdr_maxrecs( - struct xfs_mount *mp, int blocklen, int leaf) { diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index e367461a638..819a8a4dee9 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -18,9 +18,6 @@ #ifndef __XFS_BMAP_BTREE_H__ #define __XFS_BMAP_BTREE_H__ -#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ -#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */ - struct xfs_btree_cur; struct xfs_btree_block; struct xfs_mount; @@ -28,85 +25,6 @@ struct xfs_inode; struct xfs_trans; /* - * Bmap root header, on-disk form only. - */ -typedef struct xfs_bmdr_block { - __be16 bb_level; /* 0 is a leaf */ - __be16 bb_numrecs; /* current # of data records */ -} xfs_bmdr_block_t; - -/* - * Bmap btree record and extent descriptor. - * l0:63 is an extent flag (value 1 indicates non-normal). - * l0:9-62 are startoff. - * l0:0-8 and l1:21-63 are startblock. - * l1:0-20 are blockcount. - */ -#define BMBT_EXNTFLAG_BITLEN 1 -#define BMBT_STARTOFF_BITLEN 54 -#define BMBT_STARTBLOCK_BITLEN 52 -#define BMBT_BLOCKCOUNT_BITLEN 21 - -typedef struct xfs_bmbt_rec { - __be64 l0, l1; -} xfs_bmbt_rec_t; - -typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ -typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; - -typedef struct xfs_bmbt_rec_host { - __uint64_t l0, l1; -} xfs_bmbt_rec_host_t; - -/* - * Values and macros for delayed-allocation startblock fields. - */ -#define STARTBLOCKVALBITS 17 -#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) -#define DSTARTBLOCKMASKBITS (15 + 20) -#define STARTBLOCKMASK \ - (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) -#define DSTARTBLOCKMASK \ - (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) - -static inline int isnullstartblock(xfs_fsblock_t x) -{ - return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; -} - -static inline int isnulldstartblock(xfs_dfsbno_t x) -{ - return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; -} - -static inline xfs_fsblock_t nullstartblock(int k) -{ - ASSERT(k < (1 << STARTBLOCKVALBITS)); - return STARTBLOCKMASK | (k); -} - -static inline xfs_filblks_t startblockval(xfs_fsblock_t x) -{ - return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); -} - -/* - * Possible extent formats. - */ -typedef enum { - XFS_EXTFMT_NOSTATE = 0, - XFS_EXTFMT_HASSTATE -} xfs_exntfmt_t; - -/* - * Possible extent states. - */ -typedef enum { - XFS_EXT_NORM, XFS_EXT_UNWRITTEN, - XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID -} xfs_exntst_t; - -/* * Extent state and extent format macros. */ #define XFS_EXTFMT_INODE(x) \ @@ -115,27 +33,6 @@ typedef enum { #define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN) /* - * Incore version of above. - */ -typedef struct xfs_bmbt_irec -{ - xfs_fileoff_t br_startoff; /* starting file offset */ - xfs_fsblock_t br_startblock; /* starting block number */ - xfs_filblks_t br_blockcount; /* number of blocks */ - xfs_exntst_t br_state; /* extent state */ -} xfs_bmbt_irec_t; - -/* - * Key structure for non-leaf levels of the tree. - */ -typedef struct xfs_bmbt_key { - __be64 br_startoff; /* starting file offset */ -} xfs_bmbt_key_t, xfs_bmdr_key_t; - -/* btree pointer type */ -typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; - -/* * Btree block header size depends on a superblock flag. */ #define XFS_BMBT_BLOCK_LEN(mp) \ @@ -233,7 +130,7 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, xfs_bmdr_block_t *, int); extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); -extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern int xfs_bmdr_maxrecs(int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, @@ -243,6 +140,4 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); -extern const struct xfs_buf_ops xfs_bmbt_buf_ops; - #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 97f952caea7..64731ef3324 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -18,31 +18,31 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_trans.h" #include "xfs_extfree_item.h" #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_log.h" +#include "xfs_dinode.h" /* Kernel only BMAP related definitions and functions */ @@ -249,48 +249,6 @@ xfs_bmap_rtalloc( } /* - * Stack switching interfaces for allocation - */ -static void -xfs_bmapi_allocate_worker( - struct work_struct *work) -{ - struct xfs_bmalloca *args = container_of(work, - struct xfs_bmalloca, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_bmapi_allocate(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Some allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Otherwise just - * call directly to avoid the context switch overhead here. - */ -int -xfs_bmapi_allocate( - struct xfs_bmalloca *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_bmapi_allocate(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - -/* * Check if the endoff is outside the last extent. If so the caller will grow * the allocation to a stripe unit boundary. All offsets are considered outside * the end of file for an empty fork, so 1 is returned in *eof in that case. @@ -617,22 +575,27 @@ xfs_getbmap( return XFS_ERROR(ENOMEM); xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { + if (whichfork == XFS_DATA_FORK) { + if (!(iflags & BMV_IF_DELALLOC) && + (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; + + /* + * Even after flushing the inode, there can still be + * delalloc blocks on the inode beyond EOF due to + * speculative preallocation. These are not removed + * until the release function is called or the inode + * is inactivated. Hence we cannot assert here that + * ip->i_delayed_blks == 0. + */ } - /* - * even after flushing the inode, there can still be delalloc - * blocks on the inode beyond EOF due to speculative - * preallocation. These are not removed until the release - * function is called or the inode is inactivated. Hence we - * cannot assert here that ip->i_delayed_blks == 0. - */ - } - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); + } else { + lock = xfs_ilock_attr_map_shared(ip); + } /* * Don't let nex be bigger than the number of extents @@ -737,7 +700,7 @@ xfs_getbmap( out_free_map: kmem_free(map); out_unlock_ilock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); out_unlock_iolock: xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -965,32 +928,12 @@ xfs_free_eofblocks( return error; } -/* - * xfs_alloc_file_space() - * This routine allocates disk space for the given file. - * - * If alloc_type == 0, this request is for an ALLOCSP type - * request which will change the file size. In this case, no - * DMAPI event will be generated by the call. A TRUNCATE event - * will be generated later by xfs_setattr. - * - * If alloc_type != 0, this request is for a RESVSP type - * request, and a DMAPI DM_EVENT_WRITE will be generated if the - * lower block boundary byte address is less than the file's - * length. - * - * RETURNS: - * 0 on success - * errno on error - * - */ -STATIC int +int xfs_alloc_file_space( - xfs_inode_t *ip, + struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len, - int alloc_type, - int attr_flags) + int alloc_type) { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; @@ -1188,9 +1131,15 @@ xfs_zero_remaining_bytes( xfs_buf_unlock(bp); for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + uint lock_mode; + offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; + + lock_mode = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); + xfs_iunlock(ip, lock_mode); + if (error || nimap < 1) break; ASSERT(imap.br_blockcount >= 1); @@ -1207,7 +1156,12 @@ xfs_zero_remaining_bytes( XFS_BUF_UNWRITE(bp); XFS_BUF_READ(bp); XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); - xfsbdstrat(mp, bp); + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + break; + } + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, @@ -1220,7 +1174,12 @@ xfs_zero_remaining_bytes( XFS_BUF_UNDONE(bp); XFS_BUF_UNREAD(bp); XFS_BUF_WRITE(bp); - xfsbdstrat(mp, bp); + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + break; + } + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, @@ -1232,24 +1191,11 @@ xfs_zero_remaining_bytes( return error; } -/* - * xfs_free_file_space() - * This routine frees disk space for the given file. - * - * This routine is only called by xfs_change_file_space - * for an UNRESVSP type call. - * - * RETURNS: - * 0 on success - * errno on error - * - */ -STATIC int +int xfs_free_file_space( - xfs_inode_t *ip, + struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, - int attr_flags) + xfs_off_t len) { int committed; int done; @@ -1267,7 +1213,6 @@ xfs_free_file_space( int rt; xfs_fileoff_t startoffset_fsb; xfs_trans_t *tp; - int need_iolock = 1; mp = ip->i_mount; @@ -1284,20 +1229,15 @@ xfs_free_file_space( startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - if (attr_flags & XFS_ATTR_NOLOCK) - need_iolock = 0; - if (need_iolock) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - /* wait for the completion of any pending DIOs */ - inode_dio_wait(VFS_I(ip)); - } + /* wait for the completion of any pending DIOs */ + inode_dio_wait(VFS_I(ip)); rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, -1); if (error) - goto out_unlock_iolock; + goto out; truncate_pagecache_range(VFS_I(ip), ioffset, -1); /* @@ -1311,7 +1251,7 @@ xfs_free_file_space( error = xfs_bmapi_read(ip, startoffset_fsb, 1, &imap, &nimap, 0); if (error) - goto out_unlock_iolock; + goto out; ASSERT(nimap == 0 || nimap == 1); if (nimap && imap.br_startblock != HOLESTARTBLOCK) { xfs_daddr_t block; @@ -1326,7 +1266,7 @@ xfs_free_file_space( error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, &imap, &nimap, 0); if (error) - goto out_unlock_iolock; + goto out; ASSERT(nimap == 0 || nimap == 1); if (nimap && imap.br_startblock != HOLESTARTBLOCK) { ASSERT(imap.br_startblock != DELAYSTARTBLOCK); @@ -1366,7 +1306,6 @@ xfs_free_file_space( * the freeing of the space succeeds at ENOSPC. */ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); /* @@ -1412,27 +1351,23 @@ xfs_free_file_space( xfs_iunlock(ip, XFS_ILOCK_EXCL); } - out_unlock_iolock: - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + out: return error; error0: xfs_bmap_cancel(&free_list); error1: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) : - XFS_ILOCK_EXCL); - return error; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out; } -STATIC int +int xfs_zero_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, - int attr_flags) + xfs_off_t len) { struct xfs_mount *mp = ip->i_mount; uint granularity; @@ -1440,6 +1375,8 @@ xfs_zero_file_space( xfs_off_t end_boundary; int error; + trace_xfs_zero_file_space(ip); + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); /* @@ -1453,26 +1390,32 @@ xfs_zero_file_space( ASSERT(start_boundary >= offset); ASSERT(end_boundary <= offset + len); - if (!(attr_flags & XFS_ATTR_NOLOCK)) - xfs_ilock(ip, XFS_IOLOCK_EXCL); - if (start_boundary < end_boundary - 1) { - /* punch out the page cache over the conversion range */ + /* + * punch out delayed allocation blocks and the page cache over + * the conversion range + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, start_boundary), + XFS_B_TO_FSB(mp, end_boundary - start_boundary)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1); + /* convert the blocks */ error = xfs_alloc_file_space(ip, start_boundary, end_boundary - start_boundary - 1, - XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT, - attr_flags); + XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT); if (error) - goto out_unlock; + goto out; /* We've handled the interior of the range, now for the edges */ - if (start_boundary != offset) + if (start_boundary != offset) { error = xfs_iozero(ip, offset, start_boundary - offset); - if (error) - goto out_unlock; + if (error) + goto out; + } if (end_boundary != offset + len) error = xfs_iozero(ip, end_boundary, @@ -1486,194 +1429,103 @@ xfs_zero_file_space( error = xfs_iozero(ip, offset, len); } -out_unlock: - if (!(attr_flags & XFS_ATTR_NOLOCK)) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); +out: return error; } /* - * xfs_change_file_space() - * This routine allocates or frees disk space for the given file. - * The user specified parameters are checked for alignment and size - * limitations. - * + * xfs_collapse_file_space() + * This routine frees disk space and shift extent for the given file. + * The first thing we do is to free data blocks in the specified range + * by calling xfs_free_file_space(). It would also sync dirty data + * and invalidate page cache over the region on which collapse range + * is working. And Shift extent records to the left to cover a hole. * RETURNS: - * 0 on success - * errno on error + * 0 on success + * errno on error * */ int -xfs_change_file_space( - xfs_inode_t *ip, - int cmd, - xfs_flock64_t *bf, - xfs_off_t offset, - int attr_flags) +xfs_collapse_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) { - xfs_mount_t *mp = ip->i_mount; - int clrprealloc; - int error; - xfs_fsize_t fsize; - int setprealloc; - xfs_off_t startoffset; - xfs_trans_t *tp; - struct iattr iattr; - - if (!S_ISREG(ip->i_d.di_mode)) - return XFS_ERROR(EINVAL); - - switch (bf->l_whence) { - case 0: /*SEEK_SET*/ - break; - case 1: /*SEEK_CUR*/ - bf->l_start += offset; - break; - case 2: /*SEEK_END*/ - bf->l_start += XFS_ISIZE(ip); - break; - default: - return XFS_ERROR(EINVAL); - } - - /* - * length of <= 0 for resv/unresv/zero is invalid. length for - * alloc/free is ignored completely and we have no idea what userspace - * might have set it to, so set it to zero to allow range - * checks to pass. - */ - switch (cmd) { - case XFS_IOC_ZERO_RANGE: - case XFS_IOC_RESVSP: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP: - case XFS_IOC_UNRESVSP64: - if (bf->l_len <= 0) - return XFS_ERROR(EINVAL); - break; - default: - bf->l_len = 0; - break; - } - - if (bf->l_start < 0 || - bf->l_start > mp->m_super->s_maxbytes || - bf->l_start + bf->l_len < 0 || - bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) - return XFS_ERROR(EINVAL); - - bf->l_whence = 0; + int done = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + xfs_extnum_t current_ext = 0; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + int committed; + xfs_fileoff_t start_fsb; + xfs_fileoff_t shift_fsb; - startoffset = bf->l_start; - fsize = XFS_ISIZE(ip); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - setprealloc = clrprealloc = 0; - switch (cmd) { - case XFS_IOC_ZERO_RANGE: - error = xfs_zero_file_space(ip, startoffset, bf->l_len, - attr_flags); - if (error) - return error; - setprealloc = 1; - break; + trace_xfs_collapse_file_space(ip); - case XFS_IOC_RESVSP: - case XFS_IOC_RESVSP64: - error = xfs_alloc_file_space(ip, startoffset, bf->l_len, - XFS_BMAPI_PREALLOC, attr_flags); - if (error) - return error; - setprealloc = 1; - break; + start_fsb = XFS_B_TO_FSB(mp, offset + len); + shift_fsb = XFS_B_TO_FSB(mp, len); - case XFS_IOC_UNRESVSP: - case XFS_IOC_UNRESVSP64: - if ((error = xfs_free_file_space(ip, startoffset, bf->l_len, - attr_flags))) - return error; - break; + error = xfs_free_file_space(ip, offset, len); + if (error) + return error; - case XFS_IOC_ALLOCSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP: - case XFS_IOC_FREESP64: + while (!error && !done) { + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); /* - * These operations actually do IO when extending the file, but - * the allocation is done seperately to the zeroing that is - * done. This set of operations need to be serialised against - * other IO operations, such as truncate and buffered IO. We - * need to take the IOLOCK here to serialise the allocation and - * zeroing IO to prevent other IOLOCK holders (e.g. getbmap, - * truncate, direct IO) from racing against the transient - * allocated but not written state we can have here. + * We would need to reserve permanent block for transaction. + * This will come into picture when after shifting extent into + * hole we found that adjacent extents can be merged which + * may lead to freeing of a block during record update. */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); - if (startoffset > fsize) { - error = xfs_alloc_file_space(ip, fsize, - startoffset - fsize, 0, - attr_flags | XFS_ATTR_NOLOCK); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - break; - } + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + xfs_trans_cancel(tp, 0); + break; } - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = startoffset; - - error = xfs_setattr_size(ip, &iattr, - attr_flags | XFS_ATTR_NOLOCK); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, + XFS_QMOPT_RES_REGBLKS); if (error) - return error; - - clrprealloc = 1; - break; - - default: - ASSERT(0); - return XFS_ERROR(EINVAL); - } - - /* - * update the inode timestamp, mode, and prealloc flag bits - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } + goto out; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); - if ((attr_flags & XFS_ATTR_DMI) == 0) { - ip->i_d.di_mode &= ~S_ISUID; + xfs_bmap_init(&free_list, &first_block); /* - * Note that we don't have to worry about mandatory - * file locking being disabled here because we only - * clear the S_ISGID bit if the Group execute bit is - * on, but if it was on then mandatory locking wouldn't - * have been enabled. + * We are using the write transaction in which max 2 bmbt + * updates are allowed */ - if (ip->i_d.di_mode & S_IXGRP) - ip->i_d.di_mode &= ~S_ISGID; + error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb, + shift_fsb, ¤t_ext, + &first_block, &free_list, + XFS_BMAP_MAX_SHIFT_EXTENTS); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); } - if (setprealloc) - ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; - else if (clrprealloc) - ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (attr_flags & XFS_ATTR_SYNC) - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return error; + +out: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; } /* diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 061260946f7..2fdb72d2c90 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -50,12 +50,11 @@ struct xfs_bmalloca { xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ - char eof; /* set if allocating past last extent */ - char wasdel; /* replacing a delayed allocation */ - char userdata;/* set if is user data */ - char aeof; /* allocated space at eof */ - char conv; /* overwriting unwritten extents */ - char stack_switch; + bool eof; /* set if allocating past last extent */ + bool wasdel; /* replacing a delayed allocation */ + bool userdata;/* set if is user data */ + bool aeof; /* allocated space at eof */ + bool conv; /* overwriting unwritten extents */ int flags; struct completion *done; struct work_struct work; @@ -65,8 +64,6 @@ struct xfs_bmalloca { int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, int *committed); int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); -int xfs_bmapi_allocate(struct xfs_bmalloca *args); -int __xfs_bmapi_allocate(struct xfs_bmalloca *args); int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, @@ -93,9 +90,14 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, int *is_empty); /* preallocation and hole punch interface */ -int xfs_change_file_space(struct xfs_inode *ip, int cmd, - xfs_flock64_t *bf, xfs_off_t offset, - int attr_flags); +int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len, int alloc_type); +int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); +int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); +int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, + xfs_off_t len); /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 5690e102243..cf893bc1e37 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -17,24 +17,23 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_buf_item.h" #include "xfs_btree.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_alloc.h" /* * Cursor allocation zone. @@ -45,9 +44,10 @@ kmem_zone_t *xfs_btree_cur_zone; * Btree magic numbers. */ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { - { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC }, + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, + XFS_FIBT_MAGIC }, { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, - XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC } + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } }; #define xfs_btree_magic(cur) \ xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] @@ -236,8 +236,7 @@ xfs_btree_lblock_calc_crc( return; if (bip) block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_BTREE_LBLOCK_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); } bool @@ -245,8 +244,8 @@ xfs_btree_lblock_verify_crc( struct xfs_buf *bp) { if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) - return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_BTREE_LBLOCK_CRC_OFF); + return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); + return true; } @@ -269,8 +268,7 @@ xfs_btree_sblock_calc_crc( return; if (bip) block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_BTREE_SBLOCK_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } bool @@ -278,8 +276,8 @@ xfs_btree_sblock_verify_crc( struct xfs_buf *bp) { if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) - return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_BTREE_SBLOCK_CRC_OFF); + return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); + return true; } @@ -556,14 +554,11 @@ xfs_btree_get_bufl( xfs_fsblock_t fsbno, /* file system block number */ uint lock) /* lock flags for get_buf */ { - xfs_buf_t *bp; /* buffer pointer (return value) */ xfs_daddr_t d; /* real disk block address */ ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(!xfs_buf_geterror(bp)); - return bp; + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); } /* @@ -578,15 +573,12 @@ xfs_btree_get_bufs( xfs_agblock_t agbno, /* allocation group block number */ uint lock) /* lock flags for get_buf */ { - xfs_buf_t *bp; /* buffer pointer (return value) */ xfs_daddr_t d; /* real disk block address */ ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(!xfs_buf_geterror(bp)); - return bp; + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); } /* @@ -726,7 +718,6 @@ xfs_btree_read_bufl( mp->m_bsize, lock, &bp, ops); if (error) return error; - ASSERT(!xfs_buf_geterror(bp)); if (bp) xfs_buf_set_ref(bp, refval); *bpp = bp; @@ -1119,6 +1110,7 @@ xfs_btree_set_refs( xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: + case XFS_BTNUM_FINO: xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: @@ -1163,7 +1155,6 @@ STATIC int xfs_btree_read_buf_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, - int level, int flags, struct xfs_btree_block **block, struct xfs_buf **bpp) @@ -1182,7 +1173,6 @@ xfs_btree_read_buf_block( if (error) return error; - ASSERT(!xfs_buf_geterror(*bpp)); xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); return 0; @@ -1521,8 +1511,8 @@ xfs_btree_increment( union xfs_btree_ptr *ptrp; ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); - error = xfs_btree_read_buf_block(cur, ptrp, --lev, - 0, &block, &bp); + --lev; + error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; @@ -1620,8 +1610,8 @@ xfs_btree_decrement( union xfs_btree_ptr *ptrp; ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); - error = xfs_btree_read_buf_block(cur, ptrp, --lev, - 0, &block, &bp); + --lev; + error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; xfs_btree_setbuf(cur, lev, bp); @@ -1671,7 +1661,7 @@ xfs_btree_lookup_get_block( return 0; } - error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp); + error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); if (error) return error; @@ -2022,7 +2012,7 @@ xfs_btree_lshift( goto out0; /* Set up the left neighbor as "left". */ - error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; @@ -2206,7 +2196,7 @@ xfs_btree_rshift( goto out0; /* Set up the right neighbor as "right". */ - error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp); + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; @@ -2334,7 +2324,7 @@ error1: * record (to be inserted into parent). */ STATIC int /* error */ -xfs_btree_split( +__xfs_btree_split( struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *ptrp, @@ -2376,7 +2366,7 @@ xfs_btree_split( xfs_btree_buf_to_ptr(cur, lbp, &lptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat); + error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); if (error) goto error0; if (*stat == 0) @@ -2474,7 +2464,7 @@ xfs_btree_split( * point back to right instead of to left. */ if (!xfs_btree_ptr_is_null(cur, &rrptr)) { - error = xfs_btree_read_buf_block(cur, &rrptr, level, + error = xfs_btree_read_buf_block(cur, &rrptr, 0, &rrblock, &rrbp); if (error) goto error0; @@ -2514,6 +2504,85 @@ error0: return error; } +struct xfs_btree_split_args { + struct xfs_btree_cur *cur; + int level; + union xfs_btree_ptr *ptrp; + union xfs_btree_key *key; + struct xfs_btree_cur **curp; + int *stat; /* success/failure */ + int result; + bool kswapd; /* allocation in kswapd context */ + struct completion *done; + struct work_struct work; +}; + +/* + * Stack switching interfaces for allocation + */ +static void +xfs_btree_split_worker( + struct work_struct *work) +{ + struct xfs_btree_split_args *args = container_of(work, + struct xfs_btree_split_args, work); + unsigned long pflags; + unsigned long new_pflags = PF_FSTRANS; + + /* + * we are in a transaction context here, but may also be doing work + * in kswapd context, and hence we may need to inherit that state + * temporarily to ensure that we don't block waiting for memory reclaim + * in any way. + */ + if (args->kswapd) + new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + + current_set_flags_nested(&pflags, new_pflags); + + args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, + args->key, args->curp, args->stat); + complete(args->done); + + current_restore_flags_nested(&pflags, new_pflags); +} + +/* + * BMBT split requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. For the other + * btree types, just call directly to avoid the context switch overhead here. + */ +STATIC int /* error */ +xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + struct xfs_btree_split_args args; + DECLARE_COMPLETION_ONSTACK(done); + + if (cur->bc_btnum != XFS_BTNUM_BMAP) + return __xfs_btree_split(cur, level, ptrp, key, curp, stat); + + args.cur = cur; + args.level = level; + args.ptrp = ptrp; + args.key = key; + args.curp = curp; + args.stat = stat; + args.done = &done; + args.kswapd = current_is_kswapd(); + INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); + queue_work(xfs_alloc_wq, &args.work); + wait_for_completion(&done); + destroy_work_on_stack(&args.work); + return args.result; +} + + /* * Copy the old inode root contents into a real block and make the * broot point to it. @@ -2549,7 +2618,7 @@ xfs_btree_new_iroot( pp = xfs_btree_ptr_addr(cur, 1, block); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat); + error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); if (error) goto error0; if (*stat == 0) { @@ -2653,7 +2722,7 @@ xfs_btree_new_root( cur->bc_ops->init_ptr_from_cur(cur, &rptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat); + error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); if (error) goto error0; if (*stat == 0) @@ -2688,8 +2757,7 @@ xfs_btree_new_root( lbp = bp; xfs_btree_buf_to_ptr(cur, lbp, &lptr); left = block; - error = xfs_btree_read_buf_block(cur, &rptr, - cur->bc_nlevels - 1, 0, &right, &rbp); + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; bp = rbp; @@ -2700,8 +2768,7 @@ xfs_btree_new_root( xfs_btree_buf_to_ptr(cur, rbp, &rptr); right = block; xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); - error = xfs_btree_read_buf_block(cur, &lptr, - cur->bc_nlevels - 1, 0, &left, &lbp); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; bp = lbp; @@ -3653,8 +3720,7 @@ xfs_btree_delrec( rptr = cptr; right = block; rbp = bp; - error = xfs_btree_read_buf_block(cur, &lptr, level, - 0, &left, &lbp); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; @@ -3671,8 +3737,7 @@ xfs_btree_delrec( lptr = cptr; left = block; lbp = bp; - error = xfs_btree_read_buf_block(cur, &rptr, level, - 0, &right, &rbp); + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; @@ -3744,8 +3809,7 @@ xfs_btree_delrec( /* If there is a right sibling, point it to the remaining block. */ xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); if (!xfs_btree_ptr_is_null(cur, &cptr)) { - error = xfs_btree_read_buf_block(cur, &cptr, level, - 0, &rrblock, &rrbp); + error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); if (error) goto error0; xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 06729b67ad5..a04b69422f6 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -27,73 +27,6 @@ struct xfs_trans; extern kmem_zone_t *xfs_btree_cur_zone; /* - * This nonsense is to make -wlint happy. - */ -#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) -#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) -#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) - -#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) -#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) -#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) -#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) - -/* - * Generic btree header. - * - * This is a combination of the actual format used on disk for short and long - * format btrees. The first three fields are shared by both format, but the - * pointers are different and should be used with care. - * - * To get the size of the actual short or long form headers please use the size - * macros below. Never use sizeof(xfs_btree_block). - * - * The blkno, crc, lsn, owner and uuid fields are only available in filesystems - * with the crc feature bit, and all accesses to them must be conditional on - * that flag. - */ -struct xfs_btree_block { - __be32 bb_magic; /* magic number for block type */ - __be16 bb_level; /* 0 is a leaf */ - __be16 bb_numrecs; /* current # of data records */ - union { - struct { - __be32 bb_leftsib; - __be32 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be32 bb_owner; - __le32 bb_crc; - } s; /* short form pointers */ - struct { - __be64 bb_leftsib; - __be64 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be64 bb_owner; - __le32 bb_crc; - __be32 bb_pad; /* padding for alignment */ - } l; /* long form pointers */ - } bb_u; /* rest */ -}; - -#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ -#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ - -/* sizes of CRC enabled btree blocks */ -#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) -#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) - -#define XFS_BTREE_SBLOCK_CRC_OFF \ - offsetof(struct xfs_btree_block, bb_u.s.bb_crc) -#define XFS_BTREE_LBLOCK_CRC_OFF \ - offsetof(struct xfs_btree_block, bb_u.l.bb_crc) - -/* * Generic key, ptr and record wrapper structures. * * These are disk format structures, and are converted where necessary @@ -119,6 +52,19 @@ union xfs_btree_rec { }; /* + * This nonsense is to make -wlint happy. + */ +#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) +#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) +#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) + +#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) +#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) +#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) +#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) +#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) + +/* * For logging record fields. */ #define XFS_BB_MAGIC (1 << 0) @@ -147,6 +93,7 @@ do { \ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -160,6 +107,7 @@ do { \ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -184,7 +132,7 @@ struct xfs_btree_ops { int (*alloc_block)(struct xfs_btree_cur *cur, union xfs_btree_ptr *start_bno, union xfs_btree_ptr *new_bno, - int length, int *stat); + int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); /* update last record information */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 263470075ea..7a34a1ae655 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -34,12 +34,13 @@ #include <linux/backing-dev.h> #include <linux/freezer.h> -#include "xfs_sb.h" +#include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_log.h" +#include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trace.h" +#include "xfs_log.h" static kmem_zone_t *xfs_buf_zone; @@ -215,8 +216,7 @@ _xfs_buf_alloc( STATIC int _xfs_buf_get_pages( xfs_buf_t *bp, - int page_count, - xfs_buf_flags_t flags) + int page_count) { /* Make sure that we have a page list */ if (bp->b_pages == NULL) { @@ -329,7 +329,7 @@ use_alloc_page: end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; page_count = end - start; - error = _xfs_buf_get_pages(bp, page_count, flags); + error = _xfs_buf_get_pages(bp, page_count); if (unlikely(error)) return error; @@ -395,7 +395,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; + unsigned noio_flag; + /* + * vm_map_ram() will allocate auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we are likely to be under + * GFP_NOFS context here. Hence we need to tell memory reclaim + * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * memory reclaim re-entering the filesystem here and + * potentially deadlocking. + */ + noio_flag = memalloc_noio_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -403,6 +413,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); + memalloc_noio_restore(noio_flag); if (!bp->b_addr) return -ENOMEM; @@ -444,8 +455,8 @@ _xfs_buf_find( numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(numbytes < (1 << btp->bt_sshift))); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we @@ -590,7 +601,7 @@ found: error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { xfs_warn(target->bt_mount, - "%s: failed to map pages\n", __func__); + "%s: failed to map pagesn", __func__); xfs_buf_relse(bp); return NULL; } @@ -697,7 +708,11 @@ xfs_buf_read_uncached( bp->b_flags |= XBF_READ; bp->b_ops = ops; - xfsbdstrat(target->bt_mount, bp); + if (XFS_FORCED_SHUTDOWN(target->bt_mount)) { + xfs_buf_relse(bp); + return NULL; + } + xfs_buf_iorequest(bp); xfs_buf_iowait(bp); return bp; } @@ -762,7 +777,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, 0); + rval = _xfs_buf_get_pages(bp, page_count); if (rval) return rval; @@ -795,7 +810,7 @@ xfs_buf_get_uncached( goto fail; page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; - error = _xfs_buf_get_pages(bp, page_count, 0); + error = _xfs_buf_get_pages(bp, page_count); if (error) goto fail_free_buf; @@ -809,7 +824,7 @@ xfs_buf_get_uncached( error = _xfs_buf_map_pages(bp, 0); if (unlikely(error)) { xfs_warn(target->bt_mount, - "%s: failed to map pages\n", __func__); + "%s: failed to map pages", __func__); goto fail_free_mem; } @@ -1088,7 +1103,7 @@ xfs_bioerror( * This is meant for userdata errors; metadata bufs come with * iodone functions attached, so that we can track down errors. */ -STATIC int +int xfs_bioerror_relse( struct xfs_buf *bp) { @@ -1151,7 +1166,7 @@ xfs_bwrite( ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); xfs_bdstrat_cb(bp); @@ -1163,25 +1178,6 @@ xfs_bwrite( return error; } -/* - * Wrapper around bdstrat so that we can stop data from going to disk in case - * we are shutting down the filesystem. Typically user data goes thru this - * path; one of the exceptions is the superblock. - */ -void -xfsbdstrat( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - if (XFS_FORCED_SHUTDOWN(mp)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - xfs_bioerror_relse(bp); - return; - } - - xfs_buf_iorequest(bp); -} - STATIC void _xfs_buf_ioend( xfs_buf_t *bp, @@ -1254,7 +1250,7 @@ next_chunk: bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; @@ -1276,7 +1272,7 @@ next_chunk: total_nr_pages--; } - if (likely(bio->bi_size)) { + if (likely(bio->bi_iter.bi_size)) { if (xfs_buf_is_vmapped(bp)) { flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@ -1375,21 +1371,29 @@ xfs_buf_iorequest( xfs_buf_wait_unpin(bp); xfs_buf_hold(bp); - /* Set the count to 1 initially, this will stop an I/O + /* + * Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started * all the I/O from calling xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 1); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); xfs_buf_rele(bp); } /* * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer. It - * returns the I/O error code, if any, or 0 if there was no error. + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. */ int xfs_buf_iowait( @@ -1515,6 +1519,12 @@ xfs_wait_buftarg( struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); + if (bp->b_flags & XBF_WRITE_FAIL) { + xfs_alert(btp->bt_mount, +"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n" +"Please run xfs_repair to determine the extent of the problem.", + (long long)bp->b_bn); + } xfs_buf_rele(bp); } if (loop++ != 0) @@ -1601,16 +1611,14 @@ xfs_free_buftarg( kmem_free(btp); } -STATIC int -xfs_setsize_buftarg_flags( +int +xfs_setsize_buftarg( xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize, - int verbose) + unsigned int sectorsize) { - btp->bt_bsize = blocksize; - btp->bt_sshift = ffs(sectorsize) - 1; - btp->bt_smask = sectorsize - 1; + /* Set up metadata sector size info */ + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { char name[BDEVNAME_SIZE]; @@ -1618,43 +1626,35 @@ xfs_setsize_buftarg_flags( bdevname(btp->bt_bdev, name); xfs_warn(btp->bt_mount, - "Cannot set_blocksize to %u on device %s\n", + "Cannot set_blocksize to %u on device %s", sectorsize, name); return EINVAL; } + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; + return 0; } /* - * When allocating the initial buffer target we have not yet - * read in the superblock, so don't know what sized sectors - * are being used at this early stage. Play safe. + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used at this early stage. Play safe. */ STATIC int xfs_setsize_buftarg_early( xfs_buftarg_t *btp, struct block_device *bdev) { - return xfs_setsize_buftarg_flags(btp, - PAGE_SIZE, bdev_logical_block_size(bdev), 0); -} - -int -xfs_setsize_buftarg( - xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize) -{ - return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); + return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); } xfs_buftarg_t * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev, - int external, - const char *fsname) + struct block_device *bdev) { xfs_buftarg_t *btp; @@ -1798,7 +1798,7 @@ __xfs_buf_delwri_submit( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); bp->b_flags |= XBF_WRITE; if (!wait) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index e6568336101..3a7a5523d3d 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -45,6 +45,7 @@ typedef enum { #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ +#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */ /* I/O hints for the BIO layer */ #define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ @@ -70,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ + { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ @@ -80,19 +82,34 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_DELWRI_Q, "DELWRI_Q" }, \ { _XBF_COMPOUND, "COMPOUND" } + /* * Internal state flags. */ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ +/* + * The xfs_buftarg contains 2 notions of "sector size" - + * + * 1) The metadata sector size, which is the minimum unit and + * alignment of IO which will be performed by metadata operations. + * 2) The device logical sector size + * + * The first is specified at mkfs time, and is stored on-disk in the + * superblock's sb_sectsize. + * + * The latter is derived from the underlying device, and controls direct IO + * alignment constraints. + */ typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; struct backing_dev_info *bt_bdi; struct xfs_mount *bt_mount; - unsigned int bt_bsize; - unsigned int bt_sshift; - size_t bt_smask; + unsigned int bt_meta_sectorsize; + size_t bt_meta_sectormask; + size_t bt_logical_sectorsize; + size_t bt_logical_sectormask; /* LRU control structures */ struct shrinker bt_shrinker; @@ -269,9 +286,6 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); - -extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); - extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); @@ -282,10 +296,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, #define xfs_buf_zero(bp, off, len) \ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) -static inline int xfs_buf_geterror(xfs_buf_t *bp) -{ - return bp ? bp->b_error : ENOMEM; -} +extern int xfs_bioerror_relse(struct xfs_buf *); /* Buffer Utility Routines */ extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); @@ -301,7 +312,8 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) \ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ - XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) + XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \ + XBF_WRITE_FAIL)) void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) @@ -352,14 +364,28 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) xfs_buf_rele(bp); } +static inline int +xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + +static inline void +xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + /* * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, - struct block_device *, int, const char *); + struct block_device *); extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); -extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); +extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f1d85cfc0a5..4654338b03f 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -17,17 +17,18 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_log.h" kmem_zone_t *xfs_buf_item_zone; @@ -181,21 +182,47 @@ xfs_buf_item_size( trace_xfs_buf_item_size(bip); } -static struct xfs_log_iovec * +static inline void +xfs_buf_item_copy_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + struct xfs_buf *bp, + uint offset, + int first_bit, + uint nbits) +{ + offset += first_bit * XFS_BLF_CHUNK; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK, + xfs_buf_offset(bp, offset), + nbits * XFS_BLF_CHUNK); +} + +static inline bool +xfs_buf_item_straddle( + struct xfs_buf *bp, + uint offset, + int next_bit, + int last_bit) +{ + return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) + + XFS_BLF_CHUNK); +} + +static void xfs_buf_item_format_segment( struct xfs_buf_log_item *bip, - struct xfs_log_iovec *vecp, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, uint offset, struct xfs_buf_log_format *blfp) { struct xfs_buf *bp = bip->bli_buf; uint base_size; - uint nvecs; int first_bit; int last_bit; int next_bit; uint nbits; - uint buffer_offset; /* copy the flags across from the base format item */ blfp->blf_flags = bip->__bli_format.blf_flags; @@ -207,21 +234,17 @@ xfs_buf_item_format_segment( */ base_size = xfs_buf_log_format_size(blfp); - nvecs = 0; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { /* * If the map is not be dirty in the transaction, mark * the size as zero and do not advance the vector pointer. */ - goto out; + return; } - vecp->i_addr = blfp; - vecp->i_len = base_size; - vecp->i_type = XLOG_REG_TYPE_BFORMAT; - vecp++; - nvecs = 1; + blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); + blfp->blf_size = 1; if (bip->bli_flags & XFS_BLI_STALE) { /* @@ -231,14 +254,13 @@ xfs_buf_item_format_segment( */ trace_xfs_buf_item_format_stale(bip); ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); - goto out; + return; } /* * Fill in an iovec for each set of contiguous chunks. */ - last_bit = first_bit; nbits = 1; for (;;) { @@ -251,42 +273,22 @@ xfs_buf_item_format_segment( next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, (uint)last_bit + 1); /* - * If we run out of bits fill in the last iovec and get - * out of the loop. - * Else if we start a new set of bits then fill in the - * iovec for the series we were looking at and start - * counting the bits in the new one. - * Else we're still in the same set of bits so just - * keep counting and scanning. + * If we run out of bits fill in the last iovec and get out of + * the loop. Else if we start a new set of bits then fill in + * the iovec for the series we were looking at and start + * counting the bits in the new one. Else we're still in the + * same set of bits so just keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; break; - } else if (next_bit != last_bit + 1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; - vecp++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else if (xfs_buf_offset(bp, offset + - (next_bit << XFS_BLF_SHIFT)) != - (xfs_buf_offset(bp, offset + - (last_bit << XFS_BLF_SHIFT)) + - XFS_BLF_CHUNK)) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; - vecp++; + } else if (next_bit != last_bit + 1 || + xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; first_bit = next_bit; last_bit = next_bit; nbits = 1; @@ -295,9 +297,6 @@ xfs_buf_item_format_segment( nbits++; } } -out: - blfp->blf_size = nvecs; - return vecp; } /* @@ -309,10 +308,11 @@ out: STATIC void xfs_buf_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_iovec *vecp = NULL; uint offset = 0; int i; @@ -353,8 +353,8 @@ xfs_buf_item_format( } for (i = 0; i < bip->bli_format_count; i++) { - vecp = xfs_buf_item_format_segment(bip, vecp, offset, - &bip->bli_formats[i]); + xfs_buf_item_format_segment(bip, lv, &vecp, offset, + &bip->bli_formats[i]); offset += bp->b_maps[i].bm_len; } @@ -495,6 +495,14 @@ xfs_buf_item_unpin( } } +/* + * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30 + * seconds so as to not spam logs too much on repeated detection of the same + * buffer being bad.. + */ + +DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); + STATIC uint xfs_buf_item_push( struct xfs_log_item *lip, @@ -523,6 +531,14 @@ xfs_buf_item_push( trace_xfs_buf_item_push(bip); + /* has a previous flush failed due to IO errors? */ + if ((bp->b_flags & XBF_WRITE_FAIL) && + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { + xfs_warn(bp->b_target->bt_mount, +"Detected failing async write on buffer block 0x%llx. Retrying async write.\n", + (long long)bp->b_bn); + } + if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_unlock(bp); @@ -780,20 +796,6 @@ xfs_buf_item_init( bip->bli_formats[i].blf_map_size = map_size; } -#ifdef XFS_TRANS_DEBUG - /* - * Allocate the arrays for tracking what needs to be logged - * and what our callers request to be logged. bli_orig - * holds a copy of the original, clean buffer for comparison - * against, and bli_logged keeps a 1 bit flag per byte in - * the buffer to indicate which bytes the callers have asked - * to have logged. - */ - bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP); - memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length)); - bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP); -#endif - /* * Put the buf item into the list of items attached to the * buffer at the front. @@ -808,9 +810,8 @@ xfs_buf_item_init( * Mark bytes first through last inclusive as dirty in the buf * item's bitmap. */ -void +static void xfs_buf_item_log_segment( - struct xfs_buf_log_item *bip, uint first, uint last, uint *map) @@ -918,7 +919,7 @@ xfs_buf_item_log( if (end > last) end = last; - xfs_buf_item_log_segment(bip, first, end, + xfs_buf_item_log_segment(first, end, &bip->bli_formats[i].blf_data_map[0]); start += bp->b_maps[i].bm_len; @@ -941,11 +942,6 @@ STATIC void xfs_buf_item_free( xfs_buf_log_item_t *bip) { -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - kmem_free(bip->bli_logged); -#endif /* XFS_TRANS_DEBUG */ - xfs_buf_item_free_format(bip); kmem_zone_free(xfs_buf_item_zone, bip); } @@ -1056,7 +1052,7 @@ xfs_buf_iodone_callbacks( static ulong lasttime; static xfs_buftarg_t *lasttarg; - if (likely(!xfs_buf_geterror(bp))) + if (likely(!bp->b_error)) goto do_callbacks; /* @@ -1095,8 +1091,9 @@ xfs_buf_iodone_callbacks( xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ - if (!XFS_BUF_ISSTALE(bp)) { - bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; + if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { + bp->b_flags |= XBF_WRITE | XBF_ASYNC | + XBF_DONE | XBF_WRITE_FAIL; xfs_buf_iorequest(bp); } else { xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index db6371087fe..3f3455a4151 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -71,10 +71,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); -void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *, - enum xfs_blft); -void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp); - extern kmem_zone_t *xfs_buf_item_zone; #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 20bf8e8002d..a514ab61665 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -18,20 +18,20 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_bmap.h" @@ -129,56 +129,6 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_zone_free(xfs_da_state_zone, state); } -void -xfs_da3_node_hdr_from_disk( - struct xfs_da3_icnode_hdr *to, - struct xfs_da_intnode *from) -{ - ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || - from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); - - if (from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { - struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from; - - to->forw = be32_to_cpu(hdr3->info.hdr.forw); - to->back = be32_to_cpu(hdr3->info.hdr.back); - to->magic = be16_to_cpu(hdr3->info.hdr.magic); - to->count = be16_to_cpu(hdr3->__count); - to->level = be16_to_cpu(hdr3->__level); - return; - } - to->forw = be32_to_cpu(from->hdr.info.forw); - to->back = be32_to_cpu(from->hdr.info.back); - to->magic = be16_to_cpu(from->hdr.info.magic); - to->count = be16_to_cpu(from->hdr.__count); - to->level = be16_to_cpu(from->hdr.__level); -} - -void -xfs_da3_node_hdr_to_disk( - struct xfs_da_intnode *to, - struct xfs_da3_icnode_hdr *from) -{ - ASSERT(from->magic == XFS_DA_NODE_MAGIC || - from->magic == XFS_DA3_NODE_MAGIC); - - if (from->magic == XFS_DA3_NODE_MAGIC) { - struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to; - - hdr3->info.hdr.forw = cpu_to_be32(from->forw); - hdr3->info.hdr.back = cpu_to_be32(from->back); - hdr3->info.hdr.magic = cpu_to_be16(from->magic); - hdr3->__count = cpu_to_be16(from->count); - hdr3->__level = cpu_to_be16(from->level); - return; - } - to->hdr.info.forw = cpu_to_be32(from->forw); - to->hdr.info.back = cpu_to_be32(from->back); - to->hdr.info.magic = cpu_to_be16(from->magic); - to->hdr.__count = cpu_to_be16(from->count); - to->hdr.__level = cpu_to_be16(from->level); -} - static bool xfs_da3_node_verify( struct xfs_buf *bp) @@ -186,8 +136,11 @@ xfs_da3_node_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_da_intnode *hdr = bp->b_addr; struct xfs_da3_icnode_hdr ichdr; + const struct xfs_dir_ops *ops; - xfs_da3_node_hdr_from_disk(&ichdr, hdr); + ops = xfs_dir_get_ops(mp, NULL); + + ops->node_hdr_from_disk(&ichdr, hdr); if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; @@ -214,8 +167,8 @@ xfs_da3_node_verify( * we don't know if the node is for and attribute or directory tree, * so only fail if the count is outside both bounds */ - if (ichdr.count > mp->m_dir_node_ents && - ichdr.count > mp->m_attr_node_ents) + if (ichdr.count > mp->m_dir_geo->node_ents && + ichdr.count > mp->m_attr_geo->node_ents) return false; /* XXX: hash order check? */ @@ -232,8 +185,8 @@ xfs_da3_node_write_verify( struct xfs_da3_node_hdr *hdr3 = bp->b_addr; if (!xfs_da3_node_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -243,7 +196,7 @@ xfs_da3_node_write_verify( if (bip) hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF); } /* @@ -256,18 +209,20 @@ static void xfs_da3_node_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_da_blkinfo *info = bp->b_addr; switch (be16_to_cpu(info->magic)) { case XFS_DA3_NODE_MAGIC: - if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_DA3_NODE_CRC_OFF)) + if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) { + xfs_buf_ioerror(bp, EFSBADCRC); break; + } /* fall through */ case XFS_DA_NODE_MAGIC: - if (!xfs_da3_node_verify(bp)) + if (!xfs_da3_node_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); break; + } return; case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: @@ -284,8 +239,7 @@ xfs_da3_node_read_verify( } /* corrupt block */ - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); } const struct xfs_buf_ops xfs_da3_node_buf_ops = { @@ -354,11 +308,12 @@ xfs_da3_node_create( struct xfs_da3_icnode_hdr ichdr = {0}; struct xfs_buf *bp; int error; + struct xfs_inode *dp = args->dp; trace_xfs_da_node_create(args); ASSERT(level <= XFS_DA_NODE_MAXDEPTH); - error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); + error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork); if (error) return(error); bp->b_ops = &xfs_da3_node_buf_ops; @@ -377,9 +332,9 @@ xfs_da3_node_create( } ichdr.level = level; - xfs_da3_node_hdr_to_disk(node, &ichdr); + dp->d_ops->node_hdr_to_disk(node, &ichdr); xfs_trans_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); *bpp = bp; return(0); @@ -589,8 +544,8 @@ xfs_da3_root_split( oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { struct xfs_da3_icnode_hdr nodehdr; - xfs_da3_node_hdr_from_disk(&nodehdr, oldroot); - btree = xfs_da3_node_tree_p(oldroot); + dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot); + btree = dp->d_ops->node_tree_p(oldroot); size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); level = nodehdr.level; @@ -604,8 +559,8 @@ xfs_da3_root_split( struct xfs_dir2_leaf_entry *ents; leaf = (xfs_dir2_leaf_t *)oldroot; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); - ents = xfs_dir3_leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); @@ -643,28 +598,28 @@ xfs_da3_root_split( * Set up the new root node. */ error = xfs_da3_node_create(args, - (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0, + (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0, level + 1, &bp, args->whichfork); if (error) return error; node = bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr, node); - btree = xfs_da3_node_tree_p(node); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); btree[0].hashval = cpu_to_be32(blk1->hashval); btree[0].before = cpu_to_be32(blk1->blkno); btree[1].hashval = cpu_to_be32(blk2->hashval); btree[1].before = cpu_to_be32(blk2->blkno); nodehdr.count = 2; - xfs_da3_node_hdr_to_disk(node, &nodehdr); + dp->d_ops->node_hdr_to_disk(node, &nodehdr); #ifdef DEBUG if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { - ASSERT(blk1->blkno >= mp->m_dirleafblk && - blk1->blkno < mp->m_dirfreeblk); - ASSERT(blk2->blkno >= mp->m_dirleafblk && - blk2->blkno < mp->m_dirfreeblk); + ASSERT(blk1->blkno >= args->geo->leafblk && + blk1->blkno < args->geo->freeblk); + ASSERT(blk2->blkno >= args->geo->leafblk && + blk2->blkno < args->geo->freeblk); } #endif @@ -693,11 +648,12 @@ xfs_da3_node_split( int newcount; int error; int useextra; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_split(state->args); node = oldblk->bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr, node); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); /* * With V2 dirs the extra block is data or freespace. @@ -707,7 +663,7 @@ xfs_da3_node_split( /* * Do we have to split the node? */ - if (nodehdr.count + newcount > state->node_ents) { + if (nodehdr.count + newcount > state->args->geo->node_ents) { /* * Allocate a new node, add to the doubly linked chain of * nodes, then move some of our excess entries into it. @@ -744,7 +700,7 @@ xfs_da3_node_split( * If we had double-split op below us, then add the extra block too. */ node = oldblk->bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr, node); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); if (oldblk->index <= nodehdr.count) { oldblk->index++; xfs_da3_node_add(state, oldblk, addblk); @@ -793,15 +749,16 @@ xfs_da3_node_rebalance( int count; int tmp; int swap = 0; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_rebalance(state->args); node1 = blk1->bp->b_addr; node2 = blk2->bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr1, node1); - xfs_da3_node_hdr_from_disk(&nodehdr2, node2); - btree1 = xfs_da3_node_tree_p(node1); - btree2 = xfs_da3_node_tree_p(node2); + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); /* * Figure out how many entries need to move, and in which direction. @@ -814,10 +771,10 @@ xfs_da3_node_rebalance( tmpnode = node1; node1 = node2; node2 = tmpnode; - xfs_da3_node_hdr_from_disk(&nodehdr1, node1); - xfs_da3_node_hdr_from_disk(&nodehdr2, node2); - btree1 = xfs_da3_node_tree_p(node1); - btree2 = xfs_da3_node_tree_p(node2); + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); swap = 1; } @@ -879,15 +836,14 @@ xfs_da3_node_rebalance( /* * Log header of node 1 and all current bits of node 2. */ - xfs_da3_node_hdr_to_disk(node1, &nodehdr1); + dp->d_ops->node_hdr_to_disk(node1, &nodehdr1); xfs_trans_log_buf(tp, blk1->bp, - XFS_DA_LOGRANGE(node1, &node1->hdr, - xfs_da3_node_hdr_size(node1))); + XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size)); - xfs_da3_node_hdr_to_disk(node2, &nodehdr2); + dp->d_ops->node_hdr_to_disk(node2, &nodehdr2); xfs_trans_log_buf(tp, blk2->bp, XFS_DA_LOGRANGE(node2, &node2->hdr, - xfs_da3_node_hdr_size(node2) + + dp->d_ops->node_hdr_size + (sizeof(btree2[0]) * nodehdr2.count))); /* @@ -897,10 +853,10 @@ xfs_da3_node_rebalance( if (swap) { node1 = blk1->bp->b_addr; node2 = blk2->bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr1, node1); - xfs_da3_node_hdr_from_disk(&nodehdr2, node2); - btree1 = xfs_da3_node_tree_p(node1); - btree2 = xfs_da3_node_tree_p(node2); + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); } blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval); blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval); @@ -927,18 +883,19 @@ xfs_da3_node_add( struct xfs_da3_icnode_hdr nodehdr; struct xfs_da_node_entry *btree; int tmp; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_add(state->args); node = oldblk->bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr, node); - btree = xfs_da3_node_tree_p(node); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count); ASSERT(newblk->blkno != 0); if (state->args->whichfork == XFS_DATA_FORK) - ASSERT(newblk->blkno >= state->mp->m_dirleafblk && - newblk->blkno < state->mp->m_dirfreeblk); + ASSERT(newblk->blkno >= state->args->geo->leafblk && + newblk->blkno < state->args->geo->freeblk); /* * We may need to make some room before we insert the new node. @@ -955,9 +912,9 @@ xfs_da3_node_add( tmp + sizeof(*btree))); nodehdr.count += 1; - xfs_da3_node_hdr_to_disk(node, &nodehdr); + dp->d_ops->node_hdr_to_disk(node, &nodehdr); xfs_trans_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); /* * Copy the last hash value from the oldblk to propagate upwards. @@ -1094,6 +1051,7 @@ xfs_da3_root_join( struct xfs_da3_icnode_hdr oldroothdr; struct xfs_da_node_entry *btree; int error; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_root_join(state->args); @@ -1101,7 +1059,7 @@ xfs_da3_root_join( args = state->args; oldroot = root_blk->bp->b_addr; - xfs_da3_node_hdr_from_disk(&oldroothdr, oldroot); + dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot); ASSERT(oldroothdr.forw == 0); ASSERT(oldroothdr.back == 0); @@ -1115,10 +1073,10 @@ xfs_da3_root_join( * Read in the (only) child block, then copy those bytes into * the root block's buffer and free the original child block. */ - btree = xfs_da3_node_tree_p(oldroot); + btree = dp->d_ops->node_tree_p(oldroot); child = be32_to_cpu(btree[0].before); ASSERT(child != 0); - error = xfs_da3_node_read(args->trans, args->dp, child, -1, &bp, + error = xfs_da3_node_read(args->trans, dp, child, -1, &bp, args->whichfork); if (error) return error; @@ -1131,14 +1089,15 @@ xfs_da3_root_join( * that could occur. For dir3 blocks we also need to update the block * number in the buffer header. */ - memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); + memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); root_blk->bp->b_ops = bp->b_ops; xfs_trans_buf_copy_type(root_blk->bp, bp); if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; da3->blkno = cpu_to_be64(root_blk->bp->b_bn); } - xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); + xfs_trans_log_buf(args->trans, root_blk->bp, 0, + args->geo->blksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); } @@ -1168,6 +1127,7 @@ xfs_da3_node_toosmall( int error; int retval; int i; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_toosmall(state->args); @@ -1179,8 +1139,8 @@ xfs_da3_node_toosmall( blk = &state->path.blk[ state->path.active-1 ]; info = blk->bp->b_addr; node = (xfs_da_intnode_t *)info; - xfs_da3_node_hdr_from_disk(&nodehdr, node); - if (nodehdr.count > (state->node_ents >> 1)) { + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (nodehdr.count > (state->args->geo->node_ents >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); /* blk over 50%, don't try to join */ } @@ -1217,8 +1177,8 @@ xfs_da3_node_toosmall( * We prefer coalescing with the lower numbered sibling so as * to shrink a directory over time. */ - count = state->node_ents; - count -= state->node_ents >> 2; + count = state->args->geo->node_ents; + count -= state->args->geo->node_ents >> 2; count -= nodehdr.count; /* start with smaller blk num */ @@ -1231,13 +1191,13 @@ xfs_da3_node_toosmall( blkno = nodehdr.back; if (blkno == 0) continue; - error = xfs_da3_node_read(state->args->trans, state->args->dp, + error = xfs_da3_node_read(state->args->trans, dp, blkno, -1, &bp, state->args->whichfork); if (error) return(error); node = bp->b_addr; - xfs_da3_node_hdr_from_disk(&thdr, node); + dp->d_ops->node_hdr_from_disk(&thdr, node); xfs_trans_brelse(state->args->trans, bp); if (count - thdr.count >= 0) @@ -1275,6 +1235,7 @@ xfs_da3_node_toosmall( */ STATIC uint xfs_da3_node_lasthash( + struct xfs_inode *dp, struct xfs_buf *bp, int *count) { @@ -1283,12 +1244,12 @@ xfs_da3_node_lasthash( struct xfs_da3_icnode_hdr nodehdr; node = bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr, node); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); if (count) *count = nodehdr.count; if (!nodehdr.count) return 0; - btree = xfs_da3_node_tree_p(node); + btree = dp->d_ops->node_tree_p(node); return be32_to_cpu(btree[nodehdr.count - 1].hashval); } @@ -1307,6 +1268,7 @@ xfs_da3_fixhashpath( xfs_dahash_t lasthash=0; int level; int count; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_fixhashpath(state->args); @@ -1319,12 +1281,12 @@ xfs_da3_fixhashpath( return; break; case XFS_DIR2_LEAFN_MAGIC: - lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count); + lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count); if (count == 0) return; break; case XFS_DA_NODE_MAGIC: - lasthash = xfs_da3_node_lasthash(blk->bp, &count); + lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count); if (count == 0) return; break; @@ -1333,9 +1295,9 @@ xfs_da3_fixhashpath( struct xfs_da3_icnode_hdr nodehdr; node = blk->bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr, node); - btree = xfs_da3_node_tree_p(node); - if (be32_to_cpu(btree->hashval) == lasthash) + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + if (be32_to_cpu(btree[blk->index].hashval) == lasthash) break; blk->hashval = lasthash; btree[blk->index].hashval = cpu_to_be32(lasthash); @@ -1360,11 +1322,12 @@ xfs_da3_node_remove( struct xfs_da_node_entry *btree; int index; int tmp; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_remove(state->args); node = drop_blk->bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr, node); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); ASSERT(drop_blk->index < nodehdr.count); ASSERT(drop_blk->index >= 0); @@ -1372,7 +1335,7 @@ xfs_da3_node_remove( * Copy over the offending entry, or just zero it out. */ index = drop_blk->index; - btree = xfs_da3_node_tree_p(node); + btree = dp->d_ops->node_tree_p(node); if (index < nodehdr.count - 1) { tmp = nodehdr.count - index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); @@ -1385,9 +1348,9 @@ xfs_da3_node_remove( xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index]))); nodehdr.count -= 1; - xfs_da3_node_hdr_to_disk(node, &nodehdr); + dp->d_ops->node_hdr_to_disk(node, &nodehdr); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); /* * Copy the last hash value from the block to propagate upwards. @@ -1414,15 +1377,16 @@ xfs_da3_node_unbalance( struct xfs_trans *tp; int sindex; int tmp; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_unbalance(state->args); drop_node = drop_blk->bp->b_addr; save_node = save_blk->bp->b_addr; - xfs_da3_node_hdr_from_disk(&drop_hdr, drop_node); - xfs_da3_node_hdr_from_disk(&save_hdr, save_node); - drop_btree = xfs_da3_node_tree_p(drop_node); - save_btree = xfs_da3_node_tree_p(save_node); + dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node); + dp->d_ops->node_hdr_from_disk(&save_hdr, save_node); + drop_btree = dp->d_ops->node_tree_p(drop_node); + save_btree = dp->d_ops->node_tree_p(save_node); tp = state->args->trans; /* @@ -1456,10 +1420,10 @@ xfs_da3_node_unbalance( memcpy(&save_btree[sindex], &drop_btree[0], tmp); save_hdr.count += drop_hdr.count; - xfs_da3_node_hdr_to_disk(save_node, &save_hdr); + dp->d_ops->node_hdr_to_disk(save_node, &save_hdr); xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, - xfs_da3_node_hdr_size(save_node))); + dp->d_ops->node_hdr_size)); /* * Save the last hashval in the remaining block for upward propagation. @@ -1501,6 +1465,7 @@ xfs_da3_node_lookup_int( int max; int error; int retval; + struct xfs_inode *dp = state->args->dp; args = state->args; @@ -1508,7 +1473,7 @@ xfs_da3_node_lookup_int( * Descend thru the B-tree searching each level for the right * node to use, until the right hashval is found. */ - blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0; + blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0; for (blk = &state->path.blk[0], state->path.active = 1; state->path.active <= XFS_DA_NODE_MAXDEPTH; blk++, state->path.active++) { @@ -1536,7 +1501,8 @@ xfs_da3_node_lookup_int( if (blk->magic == XFS_DIR2_LEAFN_MAGIC || blk->magic == XFS_DIR3_LEAFN_MAGIC) { blk->magic = XFS_DIR2_LEAFN_MAGIC; - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); + blk->hashval = xfs_dir2_leafn_lasthash(args->dp, + blk->bp, NULL); break; } @@ -1547,8 +1513,8 @@ xfs_da3_node_lookup_int( * Search an intermediate node for a match. */ node = blk->bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr, node); - btree = xfs_da3_node_tree_p(node); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); max = nodehdr.count; blk->hashval = be32_to_cpu(btree[max - 1].hashval); @@ -1643,6 +1609,7 @@ xfs_da3_node_lookup_int( */ STATIC int xfs_da3_node_order( + struct xfs_inode *dp, struct xfs_buf *node1_bp, struct xfs_buf *node2_bp) { @@ -1655,10 +1622,10 @@ xfs_da3_node_order( node1 = node1_bp->b_addr; node2 = node2_bp->b_addr; - xfs_da3_node_hdr_from_disk(&node1hdr, node1); - xfs_da3_node_hdr_from_disk(&node2hdr, node2); - btree1 = xfs_da3_node_tree_p(node1); - btree2 = xfs_da3_node_tree_p(node2); + dp->d_ops->node_hdr_from_disk(&node1hdr, node1); + dp->d_ops->node_hdr_from_disk(&node2hdr, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); if (node1hdr.count > 0 && node2hdr.count > 0 && ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || @@ -1685,6 +1652,7 @@ xfs_da3_blk_link( struct xfs_buf *bp; int before = 0; int error; + struct xfs_inode *dp = state->args->dp; /* * Set up environment. @@ -1702,10 +1670,10 @@ xfs_da3_blk_link( before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp); break; case XFS_DIR2_LEAFN_MAGIC: - before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp); + before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp); break; case XFS_DA_NODE_MAGIC: - before = xfs_da3_node_order(old_blk->bp, new_blk->bp); + before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp); break; } @@ -1720,7 +1688,7 @@ xfs_da3_blk_link( new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { - error = xfs_da3_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, dp, be32_to_cpu(old_info->back), -1, &bp, args->whichfork); if (error) @@ -1741,7 +1709,7 @@ xfs_da3_blk_link( new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { - error = xfs_da3_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, dp, be32_to_cpu(old_info->forw), -1, &bp, args->whichfork); if (error) @@ -1861,6 +1829,7 @@ xfs_da3_path_shift( xfs_dablk_t blkno = 0; int level; int error; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_path_shift(state->args); @@ -1876,8 +1845,8 @@ xfs_da3_path_shift( level = (path->active-1) - 1; /* skip bottom layer in path */ for (blk = &path->blk[level]; level >= 0; blk--, level--) { node = blk->bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr, node); - btree = xfs_da3_node_tree_p(node); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); if (forward && (blk->index < nodehdr.count - 1)) { blk->index++; @@ -1911,7 +1880,7 @@ xfs_da3_path_shift( * Read the next child block. */ blk->blkno = blkno; - error = xfs_da3_node_read(args->trans, args->dp, blkno, -1, + error = xfs_da3_node_read(args->trans, dp, blkno, -1, &blk->bp, args->whichfork); if (error) return(error); @@ -1933,8 +1902,8 @@ xfs_da3_path_shift( case XFS_DA3_NODE_MAGIC: blk->magic = XFS_DA_NODE_MAGIC; node = (xfs_da_intnode_t *)info; - xfs_da3_node_hdr_from_disk(&nodehdr, node); - btree = xfs_da3_node_tree_p(node); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); if (forward) blk->index = 0; @@ -1947,16 +1916,15 @@ xfs_da3_path_shift( blk->magic = XFS_ATTR_LEAF_MAGIC; ASSERT(level == path->active-1); blk->index = 0; - blk->hashval = xfs_attr_leaf_lasthash(blk->bp, - NULL); + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); break; case XFS_DIR2_LEAFN_MAGIC: case XFS_DIR3_LEAFN_MAGIC: blk->magic = XFS_DIR2_LEAFN_MAGIC; ASSERT(level == path->active-1); blk->index = 0; - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, - NULL); + blk->hashval = xfs_dir2_leafn_lasthash(args->dp, + blk->bp, NULL); break; default: ASSERT(0); @@ -2123,20 +2091,12 @@ xfs_da_grow_inode( xfs_dablk_t *new_blkno) { xfs_fileoff_t bno; - int count; int error; trace_xfs_da_grow_inode(args); - if (args->whichfork == XFS_DATA_FORK) { - bno = args->dp->i_mount->m_dirleafblk; - count = args->dp->i_mount->m_dirblkfsbs; - } else { - bno = 0; - count = 1; - } - - error = xfs_da_grow_inode_int(args, &bno, count); + bno = args->geo->leafblk; + error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount); if (!error) *new_blkno = (xfs_dablk_t)bno; return error; @@ -2163,7 +2123,7 @@ xfs_da3_swap_lastblock( struct xfs_dir2_leaf *dead_leaf2; struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr par_hdr; - struct xfs_inode *ip; + struct xfs_inode *dp; struct xfs_trans *tp; struct xfs_mount *mp; struct xfs_buf *dead_buf; @@ -2187,12 +2147,12 @@ xfs_da3_swap_lastblock( dead_buf = *dead_bufp; dead_blkno = *dead_blknop; tp = args->trans; - ip = args->dp; + dp = args->dp; w = args->whichfork; ASSERT(w == XFS_DATA_FORK); - mp = ip->i_mount; - lastoff = mp->m_dirfreeblk; - error = xfs_bmap_last_before(tp, ip, &lastoff, w); + mp = dp->i_mount; + lastoff = args->geo->freeblk; + error = xfs_bmap_last_before(tp, dp, &lastoff, w); if (error) return error; if (unlikely(lastoff == 0)) { @@ -2203,15 +2163,15 @@ xfs_da3_swap_lastblock( /* * Read the last block in the btree space. */ - last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - error = xfs_da3_node_read(tp, ip, last_blkno, -1, &last_buf, w); + last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount; + error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w); if (error) return error; /* * Copy the last block into the dead buffer and log it. */ - memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize); - xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); + memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); + xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); dead_info = dead_buf->b_addr; /* * Get values from the moved block. @@ -2222,16 +2182,16 @@ xfs_da3_swap_lastblock( struct xfs_dir2_leaf_entry *ents; dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, dead_leaf2); - ents = xfs_dir3_leaf_ents_p(dead_leaf2); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2); + ents = dp->d_ops->leaf_ents_p(dead_leaf2); dead_level = 0; dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval); } else { struct xfs_da3_icnode_hdr deadhdr; dead_node = (xfs_da_intnode_t *)dead_info; - xfs_da3_node_hdr_from_disk(&deadhdr, dead_node); - btree = xfs_da3_node_tree_p(dead_node); + dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node); + btree = dp->d_ops->node_tree_p(dead_node); dead_level = deadhdr.level; dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval); } @@ -2240,7 +2200,7 @@ xfs_da3_swap_lastblock( * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -2262,7 +2222,7 @@ xfs_da3_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -2280,17 +2240,17 @@ xfs_da3_swap_lastblock( sizeof(sib_info->back))); sib_buf = NULL; } - par_blkno = mp->m_dirleafblk; + par_blkno = args->geo->leafblk; level = -1; /* * Walk down the tree looking for the parent of the moved block. */ for (;;) { - error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - xfs_da3_node_hdr_from_disk(&par_hdr, par_node); + dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); if (level >= 0 && level != par_hdr.level + 1) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", XFS_ERRLEVEL_LOW, mp); @@ -2298,7 +2258,7 @@ xfs_da3_swap_lastblock( goto done; } level = par_hdr.level; - btree = xfs_da3_node_tree_p(par_node); + btree = dp->d_ops->node_tree_p(par_node); for (entno = 0; entno < par_hdr.count && be32_to_cpu(btree[entno].hashval) < dead_hash; @@ -2337,18 +2297,18 @@ xfs_da3_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - xfs_da3_node_hdr_from_disk(&par_hdr, par_node); + dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); if (par_hdr.level != level) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } - btree = xfs_da3_node_tree_p(par_node); + btree = dp->d_ops->node_tree_p(par_node); entno = 0; } /* @@ -2390,10 +2350,7 @@ xfs_da_shrink_inode( w = args->whichfork; tp = args->trans; mp = dp->i_mount; - if (w == XFS_DATA_FORK) - count = mp->m_dirblkfsbs; - else - count = 1; + count = args->geo->fsbcount; for (;;) { /* * Remove extents. If we get ENOSPC for a dir we have to move @@ -2495,7 +2452,6 @@ xfs_buf_map_from_irec( */ static int xfs_dabuf_map( - struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, @@ -2513,7 +2469,10 @@ xfs_dabuf_map( ASSERT(map && *map); ASSERT(*nmaps == 1); - nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1; + if (whichfork == XFS_DATA_FORK) + nfsb = mp->m_dir_geo->fsbcount; + else + nfsb = mp->m_attr_geo->fsbcount; /* * Caller doesn't have a mapping. -2 means don't complain @@ -2591,7 +2550,7 @@ xfs_da_get_buf( *bpp = NULL; mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ @@ -2639,7 +2598,7 @@ xfs_da_read_buf( *bpp = NULL; mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ @@ -2658,47 +2617,6 @@ xfs_da_read_buf( xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); else xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); - - /* - * This verification code will be moved to a CRC verification callback - * function so just leave it here unchanged until then. - */ - { - xfs_dir2_data_hdr_t *hdr = bp->b_addr; - xfs_dir2_free_t *free = bp->b_addr; - xfs_da_blkinfo_t *info = bp->b_addr; - uint magic, magic1; - struct xfs_mount *mp = dp->i_mount; - - magic = be16_to_cpu(info->magic); - magic1 = be32_to_cpu(hdr->magic); - if (unlikely( - XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && - (magic != XFS_DA3_NODE_MAGIC) && - (magic != XFS_ATTR_LEAF_MAGIC) && - (magic != XFS_ATTR3_LEAF_MAGIC) && - (magic != XFS_DIR2_LEAF1_MAGIC) && - (magic != XFS_DIR3_LEAF1_MAGIC) && - (magic != XFS_DIR2_LEAFN_MAGIC) && - (magic != XFS_DIR3_LEAFN_MAGIC) && - (magic1 != XFS_DIR2_BLOCK_MAGIC) && - (magic1 != XFS_DIR3_BLOCK_MAGIC) && - (magic1 != XFS_DIR2_DATA_MAGIC) && - (magic1 != XFS_DIR3_DATA_MAGIC) && - (free->hdr.magic != - cpu_to_be32(XFS_DIR2_FREE_MAGIC)) && - (free->hdr.magic != - cpu_to_be32(XFS_DIR3_FREE_MAGIC)), - mp, XFS_ERRTAG_DA_READ_BUF, - XFS_RANDOM_DA_READ_BUF))) { - trace_xfs_da_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", - XFS_ERRLEVEL_LOW, mp, info); - error = XFS_ERROR(EFSCORRUPTED); - xfs_trans_brelse(trans, bp); - goto out_free; - } - } *bpp = bp; out_free: if (mapp != &map) @@ -2712,7 +2630,6 @@ out_free: */ xfs_daddr_t xfs_da_reada_buf( - struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, @@ -2726,7 +2643,7 @@ xfs_da_reada_buf( mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index b1f267995de..6e153e399a7 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -23,147 +23,25 @@ struct xfs_bmap_free; struct xfs_inode; struct xfs_trans; struct zone; - -/*======================================================================== - * Directory Structure when greater than XFS_LBSIZE(mp) bytes. - *========================================================================*/ - -/* - * This structure is common to both leaf nodes and non-leaf nodes in the Btree. - * - * It is used to manage a doubly linked list of all blocks at the same - * level in the Btree, and to identify which type of block this is. - */ -#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ -#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ -#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ - -typedef struct xfs_da_blkinfo { - __be32 forw; /* previous block in list */ - __be32 back; /* following block in list */ - __be16 magic; /* validity check on block */ - __be16 pad; /* unused */ -} xfs_da_blkinfo_t; - -/* - * CRC enabled directory structure types - * - * The headers change size for the additional verification information, but - * otherwise the tree layouts and contents are unchanged. Hence the da btree - * code can use the struct xfs_da_blkinfo for manipulating the tree links and - * magic numbers without modification for both v2 and v3 nodes. - */ -#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ -#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ -#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ - -struct xfs_da3_blkinfo { - /* - * the node link manipulation code relies on the fact that the first - * element of this structure is the struct xfs_da_blkinfo so it can - * ignore the differences in the rest of the structures. - */ - struct xfs_da_blkinfo hdr; - __be32 crc; /* CRC of block */ - __be64 blkno; /* first block of the buffer */ - __be64 lsn; /* sequence number of last write */ - uuid_t uuid; /* filesystem we belong to */ - __be64 owner; /* inode that owns the block */ +struct xfs_dir_ops; + +/* + * Directory/attribute geometry information. There will be one of these for each + * data fork type, and it will be passed around via the xfs_da_args. Global + * structures will be attached to the xfs_mount. + */ +struct xfs_da_geometry { + int blksize; /* da block size in bytes */ + int fsbcount; /* da block size in filesystem blocks */ + uint8_t fsblog; /* log2 of _filesystem_ block size */ + uint8_t blklog; /* log2 of da block size */ + uint node_ents; /* # of entries in a danode */ + int magicpct; /* 37% of block size in bytes */ + xfs_dablk_t datablk; /* blockno of dir data v2 */ + xfs_dablk_t leafblk; /* blockno of leaf data v2 */ + xfs_dablk_t freeblk; /* blockno of free data v2 */ }; -/* - * This is the structure of the root and intermediate nodes in the Btree. - * The leaf nodes are defined above. - * - * Entries are not packed. - * - * Since we have duplicate keys, use a binary search but always follow - * all match in the block, not just the first match found. - */ -#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ - -typedef struct xfs_da_node_hdr { - struct xfs_da_blkinfo info; /* block type, links, etc. */ - __be16 __count; /* count of active entries */ - __be16 __level; /* level above leaves (leaf == 0) */ -} xfs_da_node_hdr_t; - -struct xfs_da3_node_hdr { - struct xfs_da3_blkinfo info; /* block type, links, etc. */ - __be16 __count; /* count of active entries */ - __be16 __level; /* level above leaves (leaf == 0) */ - __be32 __pad32; -}; - -#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc)) - -typedef struct xfs_da_node_entry { - __be32 hashval; /* hash value for this descendant */ - __be32 before; /* Btree block before this key */ -} xfs_da_node_entry_t; - -typedef struct xfs_da_intnode { - struct xfs_da_node_hdr hdr; - struct xfs_da_node_entry __btree[]; -} xfs_da_intnode_t; - -struct xfs_da3_intnode { - struct xfs_da3_node_hdr hdr; - struct xfs_da_node_entry __btree[]; -}; - -/* - * In-core version of the node header to abstract the differences in the v2 and - * v3 disk format of the headers. Callers need to convert to/from disk format as - * appropriate. - */ -struct xfs_da3_icnode_hdr { - __uint32_t forw; - __uint32_t back; - __uint16_t magic; - __uint16_t count; - __uint16_t level; -}; - -extern void xfs_da3_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to, - struct xfs_da_intnode *from); -extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to, - struct xfs_da3_icnode_hdr *from); - -static inline int -__xfs_da3_node_hdr_size(bool v3) -{ - if (v3) - return sizeof(struct xfs_da3_node_hdr); - return sizeof(struct xfs_da_node_hdr); -} -static inline int -xfs_da3_node_hdr_size(struct xfs_da_intnode *dap) -{ - bool v3 = dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC); - - return __xfs_da3_node_hdr_size(v3); -} - -static inline struct xfs_da_node_entry * -xfs_da3_node_tree_p(struct xfs_da_intnode *dap) -{ - if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { - struct xfs_da3_intnode *dap3 = (struct xfs_da3_intnode *)dap; - return dap3->__btree; - } - return dap->__btree; -} - -extern void xfs_da3_intnode_from_disk(struct xfs_da3_icnode_hdr *to, - struct xfs_da_intnode *from); -extern void xfs_da3_intnode_to_disk(struct xfs_da_intnode *to, - struct xfs_da3_icnode_hdr *from); - -#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize - /*======================================================================== * Btree searching and modification structure definitions. *========================================================================*/ @@ -181,6 +59,7 @@ enum xfs_dacmp { * Structure to ease passing around component names. */ typedef struct xfs_da_args { + struct xfs_da_geometry *geo; /* da block geometry */ const __uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ __uint8_t filetype; /* filetype of inode for directories */ @@ -199,10 +78,12 @@ typedef struct xfs_da_args { int index; /* index of attr of interest in blk */ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ int rmtblkcnt; /* remote attr value block count */ + int rmtvaluelen; /* remote attr value length in bytes */ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ int index2; /* index of 2nd attr in blk */ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ + int rmtvaluelen2; /* remote attr value length in bytes */ int op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; @@ -247,8 +128,6 @@ typedef struct xfs_da_state_path { typedef struct xfs_da_state { xfs_da_args_t *args; /* filename arguments */ struct xfs_mount *mp; /* filesystem mount point */ - unsigned int blocksize; /* logical block size */ - unsigned int node_ents; /* how many entries in danode */ xfs_da_state_path_t path; /* search/split paths */ xfs_da_state_path_t altpath; /* alternate path for join */ unsigned char inleaf; /* insert into 1->lf, 0->splf */ @@ -309,8 +188,6 @@ int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int which_fork); -extern const struct xfs_buf_ops xfs_da3_node_buf_ops; - /* * Utility routines. */ @@ -324,9 +201,9 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, const struct xfs_buf_ops *ops); -xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mapped_bno, - int whichfork, const struct xfs_buf_ops *ops); +xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno, int whichfork, + const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c new file mode 100644 index 00000000000..c9aee52a37e --- /dev/null +++ b/fs/xfs/xfs_da_format.c @@ -0,0 +1,911 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" + +/* + * Shortform directory ops + */ +static int +xfs_dir2_sf_entsize( + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ + + count += len; /* name */ + count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : + sizeof(xfs_dir2_ino4_t); /* ino # */ + return count; +} + +static int +xfs_dir3_sf_entsize( + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t); +} + +static struct xfs_dir2_sf_entry * +xfs_dir2_sf_nextentry( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); +} + +static struct xfs_dir2_sf_entry * +xfs_dir3_sf_nextentry( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen)); +} + + +/* + * For filetype enabled shortform directories, the file type field is stored at + * the end of the name. Because it's only a single byte, endian conversion is + * not necessary. For non-filetype enable directories, the type is always + * unknown and we never store the value. + */ +static __uint8_t +xfs_dir2_sfe_get_ftype( + struct xfs_dir2_sf_entry *sfep) +{ + return XFS_DIR3_FT_UNKNOWN; +} + +static void +xfs_dir2_sfe_put_ftype( + struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); +} + +static __uint8_t +xfs_dir3_sfe_get_ftype( + struct xfs_dir2_sf_entry *sfep) +{ + __uint8_t ftype; + + ftype = sfep->name[sfep->namelen]; + if (ftype >= XFS_DIR3_FT_MAX) + return XFS_DIR3_FT_UNKNOWN; + return ftype; +} + +static void +xfs_dir3_sfe_put_ftype( + struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); + + sfep->name[sfep->namelen] = ftype; +} + +/* + * Inode numbers in short-form directories can come in two versions, + * either 4 bytes or 8 bytes wide. These helpers deal with the + * two forms transparently by looking at the headers i8count field. + * + * For 64-bit inode number the most significant byte must be zero. + */ +static xfs_ino_t +xfs_dir2_sf_get_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *from) +{ + if (hdr->i8count) + return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; + else + return get_unaligned_be32(&from->i4.i); +} + +static void +xfs_dir2_sf_put_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *to, + xfs_ino_t ino) +{ + ASSERT((ino & 0xff00000000000000ULL) == 0); + + if (hdr->i8count) + put_unaligned_be64(ino, &to->i8.i); + else + put_unaligned_be32(ino, &to->i4.i); +} + +static xfs_ino_t +xfs_dir2_sf_get_parent_ino( + struct xfs_dir2_sf_hdr *hdr) +{ + return xfs_dir2_sf_get_ino(hdr, &hdr->parent); +} + +static void +xfs_dir2_sf_put_parent_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); +} + +/* + * In short-form directory entries the inode numbers are stored at variable + * offset behind the entry name. If the entry stores a filetype value, then it + * sits between the name and the inode number. Hence the inode numbers may only + * be accessed through the helpers below. + */ +static xfs_ino_t +xfs_dir2_sfe_get_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return xfs_dir2_sf_get_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]); +} + +static void +xfs_dir2_sfe_put_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino); +} + +static xfs_ino_t +xfs_dir3_sfe_get_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return xfs_dir2_sf_get_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]); +} + +static void +xfs_dir3_sfe_put_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino); +} + + +/* + * Directory data block operations + */ + +/* + * For special situations, the dirent size ends up fixed because we always know + * what the size of the entry is. That's true for the "." and "..", and + * therefore we know that they are a fixed size and hence their offsets are + * constant, as is the first entry. + * + * Hence, this calculation is written as a macro to be able to be calculated at + * compile time and so certain offsets can be calculated directly in the + * structure initaliser via the macro. There are two macros - one for dirents + * with ftype and without so there are no unresolvable conditionals in the + * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power + * of 2 and the compiler doesn't reject it (unlike roundup()). + */ +#define XFS_DIR2_DATA_ENTSIZE(n) \ + round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ + sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN) + +#define XFS_DIR3_DATA_ENTSIZE(n) \ + round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ + sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)), \ + XFS_DIR2_DATA_ALIGN) + +static int +xfs_dir2_data_entsize( + int n) +{ + return XFS_DIR2_DATA_ENTSIZE(n); +} + +static int +xfs_dir3_data_entsize( + int n) +{ + return XFS_DIR3_DATA_ENTSIZE(n); +} + +static __uint8_t +xfs_dir2_data_get_ftype( + struct xfs_dir2_data_entry *dep) +{ + return XFS_DIR3_FT_UNKNOWN; +} + +static void +xfs_dir2_data_put_ftype( + struct xfs_dir2_data_entry *dep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); +} + +static __uint8_t +xfs_dir3_data_get_ftype( + struct xfs_dir2_data_entry *dep) +{ + __uint8_t ftype = dep->name[dep->namelen]; + + ASSERT(ftype < XFS_DIR3_FT_MAX); + if (ftype >= XFS_DIR3_FT_MAX) + return XFS_DIR3_FT_UNKNOWN; + return ftype; +} + +static void +xfs_dir3_data_put_ftype( + struct xfs_dir2_data_entry *dep, + __uint8_t type) +{ + ASSERT(type < XFS_DIR3_FT_MAX); + ASSERT(dep->namelen != 0); + + dep->name[dep->namelen] = type; +} + +/* + * Pointer to an entry's tag word. + */ +static __be16 * +xfs_dir2_data_entry_tag_p( + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); +} + +static __be16 * +xfs_dir3_data_entry_tag_p( + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16)); +} + +/* + * location of . and .. in data space (always block 0) + */ +static struct xfs_dir2_data_entry * +xfs_dir2_data_dot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1) + + XFS_DIR2_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_ftype_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_ftype_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_dot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_free * +xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + return hdr->bestfree; +} + +static struct xfs_dir2_data_free * +xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir3_data_hdr *)hdr)->best_free; +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_unused * +xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + +static struct xfs_dir2_data_unused * +xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + + +/* + * Directory Leaf block operations + */ +static int +xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) / + (uint)sizeof(struct xfs_dir2_leaf_entry); +} + +static struct xfs_dir2_leaf_entry * +xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + return lp->__ents; +} + +static int +xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) / + (uint)sizeof(struct xfs_dir2_leaf_entry); +} + +static struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + return ((struct xfs_dir3_leaf *)lp)->__ents; +} + +static void +xfs_dir2_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->stale = be16_to_cpu(from->hdr.stale); + + ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC || + to->magic == XFS_DIR2_LEAFN_MAGIC); +} + +static void +xfs_dir2_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC); + + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.stale = cpu_to_be16(from->stale); +} + +static void +xfs_dir3_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->stale = be16_to_cpu(hdr3->stale); + + ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC || + to->magic == XFS_DIR3_LEAFN_MAGIC); +} + +static void +xfs_dir3_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to; + + ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || + from->magic == XFS_DIR3_LEAFN_MAGIC); + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->stale = cpu_to_be16(from->stale); +} + + +/* + * Directory/Attribute Node block operations + */ +static struct xfs_da_node_entry * +xfs_da2_node_tree_p(struct xfs_da_intnode *dap) +{ + return dap->__btree; +} + +static struct xfs_da_node_entry * +xfs_da3_node_tree_p(struct xfs_da_intnode *dap) +{ + return ((struct xfs_da3_intnode *)dap)->__btree; +} + +static void +xfs_da2_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.__count); + to->level = be16_to_cpu(from->hdr.__level); +} + +static void +xfs_da2_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + ASSERT(from->magic == XFS_DA_NODE_MAGIC); + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.__count = cpu_to_be16(from->count); + to->hdr.__level = cpu_to_be16(from->level); +} + +static void +xfs_da3_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from; + + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->__count); + to->level = be16_to_cpu(hdr3->__level); +} + +static void +xfs_da3_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to; + + ASSERT(from->magic == XFS_DA3_NODE_MAGIC); + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->__count = cpu_to_be16(from->count); + hdr3->__level = cpu_to_be16(from->level); +} + + +/* + * Directory free space block operations + */ +static int +xfs_dir2_free_max_bests(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) / + sizeof(xfs_dir2_data_off_t); +} + +static __be16 * +xfs_dir2_free_bests_p(struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr)); +} + +/* + * Convert data space db to the corresponding free db. + */ +static xfs_dir2_db_t +xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir2_free_max_bests(geo)); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static int +xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % xfs_dir2_free_max_bests(geo); +} + +static int +xfs_dir3_free_max_bests(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) / + sizeof(xfs_dir2_data_off_t); +} + +static __be16 * +xfs_dir3_free_bests_p(struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr)); +} + +/* + * Convert data space db to the corresponding free db. + */ +static xfs_dir2_db_t +xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir3_free_max_bests(geo)); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static int +xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % xfs_dir3_free_max_bests(geo); +} + +static void +xfs_dir2_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + to->magic = be32_to_cpu(from->hdr.magic); + to->firstdb = be32_to_cpu(from->hdr.firstdb); + to->nvalid = be32_to_cpu(from->hdr.nvalid); + to->nused = be32_to_cpu(from->hdr.nused); + ASSERT(to->magic == XFS_DIR2_FREE_MAGIC); +} + +static void +xfs_dir2_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_FREE_MAGIC); + + to->hdr.magic = cpu_to_be32(from->magic); + to->hdr.firstdb = cpu_to_be32(from->firstdb); + to->hdr.nvalid = cpu_to_be32(from->nvalid); + to->hdr.nused = cpu_to_be32(from->nused); +} + +static void +xfs_dir3_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from; + + to->magic = be32_to_cpu(hdr3->hdr.magic); + to->firstdb = be32_to_cpu(hdr3->firstdb); + to->nvalid = be32_to_cpu(hdr3->nvalid); + to->nused = be32_to_cpu(hdr3->nused); + + ASSERT(to->magic == XFS_DIR3_FREE_MAGIC); +} + +static void +xfs_dir3_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to; + + ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); + + hdr3->hdr.magic = cpu_to_be32(from->magic); + hdr3->firstdb = cpu_to_be32(from->firstdb); + hdr3->nvalid = cpu_to_be32(from->nvalid); + hdr3->nused = cpu_to_be32(from->nused); +} + +static const struct xfs_dir_ops xfs_dir2_ops = { + .sf_entsize = xfs_dir2_sf_entsize, + .sf_nextentry = xfs_dir2_sf_nextentry, + .sf_get_ftype = xfs_dir2_sfe_get_ftype, + .sf_put_ftype = xfs_dir2_sfe_put_ftype, + .sf_get_ino = xfs_dir2_sfe_get_ino, + .sf_put_ino = xfs_dir2_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir2_data_entsize, + .data_get_ftype = xfs_dir2_data_get_ftype, + .data_put_ftype = xfs_dir2_data_put_ftype, + .data_entry_tag_p = xfs_dir2_data_entry_tag_p, + .data_bestfree_p = xfs_dir2_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1) + + XFS_DIR2_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), + + .data_dot_entry_p = xfs_dir2_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir2_data_first_entry_p, + .data_entry_p = xfs_dir2_data_entry_p, + .data_unused_p = xfs_dir2_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir2_max_leaf_ents, + .leaf_ents_p = xfs_dir2_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), + .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, + .free_max_bests = xfs_dir2_free_max_bests, + .free_bests_p = xfs_dir2_free_bests_p, + .db_to_fdb = xfs_dir2_db_to_fdb, + .db_to_fdindex = xfs_dir2_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir2_ftype_ops = { + .sf_entsize = xfs_dir3_sf_entsize, + .sf_nextentry = xfs_dir3_sf_nextentry, + .sf_get_ftype = xfs_dir3_sfe_get_ftype, + .sf_put_ftype = xfs_dir3_sfe_put_ftype, + .sf_get_ino = xfs_dir3_sfe_get_ino, + .sf_put_ino = xfs_dir3_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir3_data_entsize, + .data_get_ftype = xfs_dir3_data_get_ftype, + .data_put_ftype = xfs_dir3_data_put_ftype, + .data_entry_tag_p = xfs_dir3_data_entry_tag_p, + .data_bestfree_p = xfs_dir2_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), + + .data_dot_entry_p = xfs_dir2_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p, + .data_entry_p = xfs_dir2_data_entry_p, + .data_unused_p = xfs_dir2_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir2_max_leaf_ents, + .leaf_ents_p = xfs_dir2_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), + .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, + .free_max_bests = xfs_dir2_free_max_bests, + .free_bests_p = xfs_dir2_free_bests_p, + .db_to_fdb = xfs_dir2_db_to_fdb, + .db_to_fdindex = xfs_dir2_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir3_ops = { + .sf_entsize = xfs_dir3_sf_entsize, + .sf_nextentry = xfs_dir3_sf_nextentry, + .sf_get_ftype = xfs_dir3_sfe_get_ftype, + .sf_put_ftype = xfs_dir3_sfe_put_ftype, + .sf_get_ino = xfs_dir3_sfe_get_ino, + .sf_put_ino = xfs_dir3_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir3_data_entsize, + .data_get_ftype = xfs_dir3_data_get_ftype, + .data_put_ftype = xfs_dir3_data_put_ftype, + .data_entry_tag_p = xfs_dir3_data_entry_tag_p, + .data_bestfree_p = xfs_dir3_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir3_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir3_data_hdr), + + .data_dot_entry_p = xfs_dir3_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir3_data_first_entry_p, + .data_entry_p = xfs_dir3_data_entry_p, + .data_unused_p = xfs_dir3_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir3_max_leaf_ents, + .leaf_ents_p = xfs_dir3_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da3_node_hdr), + .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, + .node_tree_p = xfs_da3_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir3_free_hdr), + .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk, + .free_max_bests = xfs_dir3_free_max_bests, + .free_bests_p = xfs_dir3_free_bests_p, + .db_to_fdb = xfs_dir3_db_to_fdb, + .db_to_fdindex = xfs_dir3_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir2_nondir_ops = { + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, +}; + +static const struct xfs_dir_ops xfs_dir3_nondir_ops = { + .node_hdr_size = sizeof(struct xfs_da3_node_hdr), + .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, + .node_tree_p = xfs_da3_node_tree_p, +}; + +/* + * Return the ops structure according to the current config. If we are passed + * an inode, then that overrides the default config we use which is based on + * feature bits. + */ +const struct xfs_dir_ops * +xfs_dir_get_ops( + struct xfs_mount *mp, + struct xfs_inode *dp) +{ + if (dp) + return dp->d_ops; + if (mp->m_dir_inode_ops) + return mp->m_dir_inode_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + return &xfs_dir3_ops; + if (xfs_sb_version_hasftype(&mp->m_sb)) + return &xfs_dir2_ftype_ops; + return &xfs_dir2_ops; +} + +const struct xfs_dir_ops * +xfs_nondir_get_ops( + struct xfs_mount *mp, + struct xfs_inode *dp) +{ + if (dp) + return dp->d_ops; + if (mp->m_nondir_inode_ops) + return mp->m_nondir_inode_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + return &xfs_dir3_nondir_ops; + return &xfs_dir2_nondir_ops; +} diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_da_format.h index 9cf67381adf..0a49b028637 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_da_format.h @@ -16,8 +16,107 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef __XFS_DIR2_FORMAT_H__ -#define __XFS_DIR2_FORMAT_H__ +#ifndef __XFS_DA_FORMAT_H__ +#define __XFS_DA_FORMAT_H__ + +/* + * This structure is common to both leaf nodes and non-leaf nodes in the Btree. + * + * It is used to manage a doubly linked list of all blocks at the same + * level in the Btree, and to identify which type of block this is. + */ +#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ +#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ +#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ + +typedef struct xfs_da_blkinfo { + __be32 forw; /* previous block in list */ + __be32 back; /* following block in list */ + __be16 magic; /* validity check on block */ + __be16 pad; /* unused */ +} xfs_da_blkinfo_t; + +/* + * CRC enabled directory structure types + * + * The headers change size for the additional verification information, but + * otherwise the tree layouts and contents are unchanged. Hence the da btree + * code can use the struct xfs_da_blkinfo for manipulating the tree links and + * magic numbers without modification for both v2 and v3 nodes. + */ +#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ +#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ + +struct xfs_da3_blkinfo { + /* + * the node link manipulation code relies on the fact that the first + * element of this structure is the struct xfs_da_blkinfo so it can + * ignore the differences in the rest of the structures. + */ + struct xfs_da_blkinfo hdr; + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +/* + * This is the structure of the root and intermediate nodes in the Btree. + * The leaf nodes are defined above. + * + * Entries are not packed. + * + * Since we have duplicate keys, use a binary search but always follow + * all match in the block, not just the first match found. + */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ + +typedef struct xfs_da_node_hdr { + struct xfs_da_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ +} xfs_da_node_hdr_t; + +struct xfs_da3_node_hdr { + struct xfs_da3_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ + __be32 __pad32; +}; + +#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc)) + +typedef struct xfs_da_node_entry { + __be32 hashval; /* hash value for this descendant */ + __be32 before; /* Btree block before this key */ +} xfs_da_node_entry_t; + +typedef struct xfs_da_intnode { + struct xfs_da_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +} xfs_da_intnode_t; + +struct xfs_da3_intnode { + struct xfs_da3_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +}; + +/* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. + */ +struct xfs_da3_icnode_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t level; +}; /* * Directory version 2. @@ -189,79 +288,6 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count)); } -static inline int -xfs_dir3_sf_entsize( - struct xfs_mount *mp, - struct xfs_dir2_sf_hdr *hdr, - int len) -{ - int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ - - count += len; /* name */ - count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : - sizeof(xfs_dir2_ino4_t); /* ino # */ - if (xfs_sb_version_hasftype(&mp->m_sb)) - count += sizeof(__uint8_t); /* file type */ - return count; -} - -static inline struct xfs_dir2_sf_entry * -xfs_dir3_sf_nextentry( - struct xfs_mount *mp, - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return (struct xfs_dir2_sf_entry *) - ((char *)sfep + xfs_dir3_sf_entsize(mp, hdr, sfep->namelen)); -} - -/* - * in dir3 shortform directories, the file type field is stored at a variable - * offset after the inode number. Because it's only a single byte, endian - * conversion is not necessary. - */ -static inline __uint8_t * -xfs_dir3_sfe_ftypep( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return (__uint8_t *)&sfep->name[sfep->namelen]; -} - -static inline __uint8_t -xfs_dir3_sfe_get_ftype( - struct xfs_mount *mp, - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - __uint8_t *ftp; - - if (!xfs_sb_version_hasftype(&mp->m_sb)) - return XFS_DIR3_FT_UNKNOWN; - - ftp = xfs_dir3_sfe_ftypep(hdr, sfep); - if (*ftp >= XFS_DIR3_FT_MAX) - return XFS_DIR3_FT_UNKNOWN; - return *ftp; -} - -static inline void -xfs_dir3_sfe_put_ftype( - struct xfs_mount *mp, - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep, - __uint8_t ftype) -{ - __uint8_t *ftp; - - ASSERT(ftype < XFS_DIR3_FT_MAX); - - if (!xfs_sb_version_hasftype(&mp->m_sb)) - return; - ftp = xfs_dir3_sfe_ftypep(hdr, sfep); - *ftp = ftype; -} - /* * Data block structures. * @@ -298,8 +324,6 @@ xfs_dir3_sfe_put_ftype( #define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) #define XFS_DIR2_DATA_SPACE 0 #define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_DATA_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) /* * Describe a free area in the data block. @@ -345,17 +369,6 @@ struct xfs_dir3_data_hdr { #define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) -static inline struct xfs_dir2_data_free * -xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) -{ - if (hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { - struct xfs_dir3_data_hdr *hdr3 = (struct xfs_dir3_data_hdr *)hdr; - return hdr3->best_free; - } - return hdr->bestfree; -} - /* * Active entry in a data block. * @@ -389,72 +402,6 @@ typedef struct xfs_dir2_data_unused { } xfs_dir2_data_unused_t; /* - * Size of a data entry. - */ -static inline int -__xfs_dir3_data_entsize( - bool ftype, - int n) -{ - int size = offsetof(struct xfs_dir2_data_entry, name[0]); - - size += n; - size += sizeof(xfs_dir2_data_off_t); - if (ftype) - size += sizeof(__uint8_t); - return roundup(size, XFS_DIR2_DATA_ALIGN); -} -static inline int -xfs_dir3_data_entsize( - struct xfs_mount *mp, - int n) -{ - bool ftype = xfs_sb_version_hasftype(&mp->m_sb) ? true : false; - return __xfs_dir3_data_entsize(ftype, n); -} - -static inline __uint8_t -xfs_dir3_dirent_get_ftype( - struct xfs_mount *mp, - struct xfs_dir2_data_entry *dep) -{ - if (xfs_sb_version_hasftype(&mp->m_sb)) { - __uint8_t type = dep->name[dep->namelen]; - - ASSERT(type < XFS_DIR3_FT_MAX); - if (type < XFS_DIR3_FT_MAX) - return type; - - } - return XFS_DIR3_FT_UNKNOWN; -} - -static inline void -xfs_dir3_dirent_put_ftype( - struct xfs_mount *mp, - struct xfs_dir2_data_entry *dep, - __uint8_t type) -{ - ASSERT(type < XFS_DIR3_FT_MAX); - ASSERT(dep->namelen != 0); - - if (xfs_sb_version_hasftype(&mp->m_sb)) - dep->name[dep->namelen] = type; -} - -/* - * Pointer to an entry's tag word. - */ -static inline __be16 * -xfs_dir3_data_entry_tag_p( - struct xfs_mount *mp, - struct xfs_dir2_data_entry *dep) -{ - return (__be16 *)((char *)dep + - xfs_dir3_data_entsize(mp, dep->namelen) - sizeof(__be16)); -} - -/* * Pointer to a freespace's tag word. */ static inline __be16 * @@ -464,93 +411,6 @@ xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup) be16_to_cpu(dup->length) - sizeof(__be16)); } -static inline size_t -xfs_dir3_data_hdr_size(bool dir3) -{ - if (dir3) - return sizeof(struct xfs_dir3_data_hdr); - return sizeof(struct xfs_dir2_data_hdr); -} - -static inline size_t -xfs_dir3_data_entry_offset(struct xfs_dir2_data_hdr *hdr) -{ - bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); - return xfs_dir3_data_hdr_size(dir3); -} - -static inline struct xfs_dir2_data_entry * -xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + xfs_dir3_data_entry_offset(hdr)); -} - -static inline struct xfs_dir2_data_unused * -xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_unused *) - ((char *)hdr + xfs_dir3_data_entry_offset(hdr)); -} - -/* - * Offsets of . and .. in data space (always block 0) - * - * XXX: there is scope for significant optimisation of the logic here. Right - * now we are checking for "dir3 format" over and over again. Ideally we should - * only do it once for each operation. - */ -static inline xfs_dir2_data_aoff_t -xfs_dir3_data_dot_offset(struct xfs_mount *mp) -{ - return xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb)); -} - -static inline xfs_dir2_data_aoff_t -xfs_dir3_data_dotdot_offset(struct xfs_mount *mp) -{ - return xfs_dir3_data_dot_offset(mp) + - xfs_dir3_data_entsize(mp, 1); -} - -static inline xfs_dir2_data_aoff_t -xfs_dir3_data_first_offset(struct xfs_mount *mp) -{ - return xfs_dir3_data_dotdot_offset(mp) + - xfs_dir3_data_entsize(mp, 2); -} - -/* - * location of . and .. in data space (always block 0) - */ -static inline struct xfs_dir2_data_entry * -xfs_dir3_data_dot_entry_p( - struct xfs_mount *mp, - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + xfs_dir3_data_dot_offset(mp)); -} - -static inline struct xfs_dir2_data_entry * -xfs_dir3_data_dotdot_entry_p( - struct xfs_mount *mp, - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + xfs_dir3_data_dotdot_offset(mp)); -} - -static inline struct xfs_dir2_data_entry * -xfs_dir3_data_first_entry_p( - struct xfs_mount *mp, - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + xfs_dir3_data_first_offset(mp)); -} - /* * Leaf block structures. * @@ -588,8 +448,6 @@ xfs_dir3_data_first_entry_p( */ #define XFS_DIR2_LEAF_SPACE 1 #define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_LEAF_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET) /* * Leaf block header. @@ -645,50 +503,6 @@ struct xfs_dir3_leaf { #define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) -extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to, - struct xfs_dir2_leaf *from); - -static inline int -xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp) -{ - if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || - lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) - return sizeof(struct xfs_dir3_leaf_hdr); - return sizeof(struct xfs_dir2_leaf_hdr); -} - -static inline int -xfs_dir3_max_leaf_ents(struct xfs_mount *mp, struct xfs_dir2_leaf *lp) -{ - return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size(lp)) / - (uint)sizeof(struct xfs_dir2_leaf_entry); -} - -/* - * Get address of the bestcount field in the single-leaf block. - */ -static inline struct xfs_dir2_leaf_entry * -xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp) -{ - if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || - lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { - struct xfs_dir3_leaf *lp3 = (struct xfs_dir3_leaf *)lp; - return lp3->__ents; - } - return lp->__ents; -} - -/* - * Get address of the bestcount field in the single-leaf block. - */ -static inline struct xfs_dir2_leaf_tail * -xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp) -{ - return (struct xfs_dir2_leaf_tail *) - ((char *)lp + mp->m_dirblksize - - sizeof(struct xfs_dir2_leaf_tail)); -} - /* * Get address of the bests array in the single-leaf block. */ @@ -699,123 +513,6 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) } /* - * DB blocks here are logical directory block numbers, not filesystem blocks. - */ - -/* - * Convert dataptr to byte in file space - */ -static inline xfs_dir2_off_t -xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; -} - -/* - * Convert byte in file space to dataptr. It had better be aligned. - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); -} - -/* - * Convert byte in space to (DB) block - */ -static inline xfs_dir2_db_t -xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_db_t) - (by >> (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)); -} - -/* - * Convert dataptr to a block number - */ -static inline xfs_dir2_db_t -xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp)); -} - -/* - * Convert byte in space to offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_data_aoff_t)(by & - ((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) - 1)); -} - -/* - * Convert dataptr to a byte offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp)); -} - -/* - * Convert block and offset to byte in space - */ -static inline xfs_dir2_off_t -xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return ((xfs_dir2_off_t)db << - (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) + o; -} - -/* - * Convert block (DB) to block (dablk) - */ -static inline xfs_dablk_t -xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return (xfs_dablk_t)(db << mp->m_sb.sb_dirblklog); -} - -/* - * Convert byte in space to (DA) block - */ -static inline xfs_dablk_t -xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by)); -} - -/* - * Convert block and offset to dataptr - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o)); -} - -/* - * Convert block (dablk) to block (DB) - */ -static inline xfs_dir2_db_t -xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) -{ - return (xfs_dir2_db_t)(da >> mp->m_sb.sb_dirblklog); -} - -/* - * Convert block (dablk) to byte offset in space - */ -static inline xfs_dir2_off_t -xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) -{ - return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0); -} - -/* * Free space block defintions for the node format. */ @@ -824,8 +521,6 @@ xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) */ #define XFS_DIR2_FREE_SPACE 2 #define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_FREE_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET) typedef struct xfs_dir2_free_hdr { __be32 magic; /* XFS_DIR2_FREE_MAGIC */ @@ -869,48 +564,6 @@ struct xfs_dir3_icfree_hdr { }; -void xfs_dir3_free_hdr_from_disk(struct xfs_dir3_icfree_hdr *to, - struct xfs_dir2_free *from); - -static inline int -xfs_dir3_free_hdr_size(struct xfs_mount *mp) -{ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return sizeof(struct xfs_dir3_free_hdr); - return sizeof(struct xfs_dir2_free_hdr); -} - -static inline int -xfs_dir3_free_max_bests(struct xfs_mount *mp) -{ - return (mp->m_dirblksize - xfs_dir3_free_hdr_size(mp)) / - sizeof(xfs_dir2_data_off_t); -} - -static inline __be16 * -xfs_dir3_free_bests_p(struct xfs_mount *mp, struct xfs_dir2_free *free) -{ - return (__be16 *)((char *)free + xfs_dir3_free_hdr_size(mp)); -} - -/* - * Convert data space db to the corresponding free db. - */ -static inline xfs_dir2_db_t -xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp); -} - -/* - * Convert data space db to the corresponding index in a free db. - */ -static inline int -xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return db % xfs_dir3_free_max_bests(mp); -} - /* * Single block format. * @@ -943,22 +596,266 @@ typedef struct xfs_dir2_block_tail { } xfs_dir2_block_tail_t; /* - * Pointer to the leaf header embedded in a data block (1-block format) + * Pointer to the leaf entries embedded in a data block (1-block format) */ -static inline struct xfs_dir2_block_tail * -xfs_dir2_block_tail_p(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr) +static inline struct xfs_dir2_leaf_entry * +xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) { - return ((struct xfs_dir2_block_tail *) - ((char *)hdr + mp->m_dirblksize)) - 1; + return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); } + /* - * Pointer to the leaf entries embedded in a data block (1-block format) + * Attribute storage layout + * + * Attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. The + * internal links in the Btree are logical block offsets into the file. + * + * Struct leaf_entry's are packed from the top. Name/values grow from the + * bottom but are not packed. The freemap contains run-length-encoded entries + * for the free bytes after the leaf_entry's, but only the N largest such, + * smaller runs are dropped. When the freemap doesn't show enough space + * for an allocation, we compact the name/value area and try again. If we + * still don't have enough space, then we have to split the block. The + * name/value structs (both local and remote versions) must be 32bit aligned. + * + * Since we have duplicate hash keys, for each key that matches, compare + * the actual name string. The root and intermediate node search always + * takes the first-in-the-block key match found, so we should only have + * to work "forw"ard. If none matches, continue with the "forw"ard leaf + * nodes until the hash key changes or the attribute name is found. + * + * We store the fact that an attribute is a ROOT/USER/SECURE attribute in + * the leaf_entry. The namespaces are independent only because we also look + * at the namespace bit when we are looking for a matching attribute name. + * + * We also store an "incomplete" bit in the leaf_entry. It shows that an + * attribute is in the middle of being created and should not be shown to + * the user if we crash during the time that the bit is set. We clear the + * bit when we have finished setting up the attribute. We do this because + * we cannot create some large attributes inside a single transaction, and we + * need some indication that we weren't finished if we crash in the middle. + */ +#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ + +typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ + __be16 base; /* base of free region */ + __be16 size; /* length of free region */ +} xfs_attr_leaf_map_t; + +typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __be16 count; /* count of active leaf_entry's */ + __be16 usedbytes; /* num bytes of names/values stored */ + __be16 firstused; /* first used byte in name area */ + __u8 holes; /* != 0 if blk needs compaction */ + __u8 pad1; + xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE]; + /* N largest free regions */ +} xfs_attr_leaf_hdr_t; + +typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ + __be32 hashval; /* hash value of name */ + __be16 nameidx; /* index into buffer of name/value */ + __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ + __u8 pad2; /* unused pad byte */ +} xfs_attr_leaf_entry_t; + +typedef struct xfs_attr_leaf_name_local { + __be16 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 nameval[1]; /* name/value bytes */ +} xfs_attr_leaf_name_local_t; + +typedef struct xfs_attr_leaf_name_remote { + __be32 valueblk; /* block number of value bytes */ + __be32 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 name[1]; /* name bytes */ +} xfs_attr_leaf_name_remote_t; + +typedef struct xfs_attr_leafblock { + xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ + xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ + xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ +} xfs_attr_leafblock_t; + +/* + * CRC enabled leaf structures. Called "version 3" structures to match the + * version number of the directory and dablk structures for this feature, and + * attr2 is already taken by the variable inode attribute fork size feature. + */ +struct xfs_attr3_leaf_hdr { + struct xfs_da3_blkinfo info; + __be16 count; + __be16 usedbytes; + __be16 firstused; + __u8 holes; + __u8 pad1; + struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; + __be32 pad2; /* 64 bit alignment */ +}; + +#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) + +struct xfs_attr3_leafblock { + struct xfs_attr3_leaf_hdr hdr; + struct xfs_attr_leaf_entry entries[1]; + + /* + * The rest of the block contains the following structures after the + * leaf entries, growing from the bottom up. The variables are never + * referenced, the locations accessed purely from helper functions. + * + * struct xfs_attr_leaf_name_local + * struct xfs_attr_leaf_name_remote + */ +}; + +/* + * incore, neutral version of the attribute leaf header + */ +struct xfs_attr3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t usedbytes; + __uint16_t firstused; + __u8 holes; + struct { + __uint16_t base; + __uint16_t size; + } freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; + +/* + * Flags used in the leaf_entry[i].flags field. + * NOTE: the INCOMPLETE bit must not collide with the flags bits specified + * on the system call, they are "or"ed together for various operations. */ -static inline struct xfs_dir2_leaf_entry * -xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) +#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ +#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ +#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ +#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) + +/* + * Conversion macros for converting namespace bits from argument flags + * to ondisk flags. + */ +#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) +#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) +#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ + ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) +#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ + ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) + +/* + * Alignment for namelist and valuelist entries (since they are mixed + * there can be only one alignment value) + */ +#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) + +static inline int +xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp) { - return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return sizeof(struct xfs_attr3_leaf_hdr); + return sizeof(struct xfs_attr_leaf_hdr); +} + +static inline struct xfs_attr_leaf_entry * +xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return &((struct xfs_attr3_leafblock *)leafp)->entries[0]; + return &leafp->entries[0]; } -#endif /* __XFS_DIR2_FORMAT_H__ */ +/* + * Cast typed pointers for "local" and "remote" name/value structs. + */ +static inline char * +xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx) +{ + struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp); + + return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)]; +} + +static inline xfs_attr_leaf_name_remote_t * +xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx); +} + +static inline xfs_attr_leaf_name_local_t * +xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx); +} + +/* + * Calculate total bytes used (including trailing pad for alignment) for + * a "local" name/value structure, a "remote" name/value structure, and + * a pointer which might be either. + */ +static inline int xfs_attr_leaf_entsize_remote(int nlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local_max(int bsize) +{ + return (((bsize) >> 1) + ((bsize) >> 2)); +} + + + +/* + * Remote attribute block format definition + * + * There is one of these headers per filesystem block in a remote attribute. + * This is done to ensure there is a 1:1 mapping between the attribute value + * length and the number of blocks needed to store the attribute. This makes the + * verification of a buffer a little more complex, but greatly simplifies the + * allocation, reading and writing of these attributes as we don't have to guess + * the number of blocks needed to store the attribute data. + */ +#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ + +struct xfs_attr3_rmt_hdr { + __be32 rm_magic; + __be32 rm_offset; + __be32 rm_bytes; + __be32 rm_crc; + uuid_t rm_uuid; + __be64 rm_owner; + __be64 rm_blkno; + __be64 rm_lsn; +}; + +#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) + +#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_attr3_rmt_hdr) : 0)) + +#endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index e5869b50dc4..623bbe8fd92 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -89,6 +89,8 @@ typedef struct xfs_dinode { /* structure must be padded to 64 bit alignment */ } xfs_dinode_t; +#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) + #define DI_MAX_FLUSH 0xffff /* diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index edf203ab50a..79670cda48a 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -17,25 +17,24 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_dinode.h" struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; @@ -86,33 +85,74 @@ static struct xfs_nameops xfs_ascii_ci_nameops = { .compname = xfs_ascii_ci_compname, }; -void -xfs_dir_mount( - xfs_mount_t *mp) +int +xfs_da_mount( + struct xfs_mount *mp) { - int nodehdr_size; + struct xfs_da_geometry *dageo; + int nodehdr_size; - ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb)); + ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= XFS_MAX_BLOCKSIZE); - mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog); - mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog; - mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp)); - mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); - mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp)); - - nodehdr_size = __xfs_da3_node_hdr_size(xfs_sb_version_hascrc(&mp->m_sb)); - mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - nodehdr_size) / + + mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL); + mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL); + + nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; + mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + if (!mp->m_dir_geo || !mp->m_attr_geo) { + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); + return ENOMEM; + } + + /* set up directory geometry */ + dageo = mp->m_dir_geo; + dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; + + /* + * Now we've set up the block conversion variables, we can calculate the + * segment block constants using the geometry structure. + */ + dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET); + dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET); + dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); + dageo->node_ents = (dageo->blksize - nodehdr_size) / (uint)sizeof(xfs_da_node_entry_t); - mp->m_dir_node_ents = (mp->m_dirblksize - nodehdr_size) / + dageo->magicpct = (dageo->blksize * 37) / 100; + + /* set up attribute geometry - single fsb only */ + dageo = mp->m_attr_geo; + dageo->blklog = mp->m_sb.sb_blocklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1; + dageo->node_ents = (dageo->blksize - nodehdr_size) / (uint)sizeof(xfs_da_node_entry_t); + dageo->magicpct = (dageo->blksize * 37) / 100; - mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; if (xfs_sb_version_hasasciici(&mp->m_sb)) mp->m_dirnameops = &xfs_ascii_ci_nameops; else mp->m_dirnameops = &xfs_default_nameops; + + return 0; +} + +void +xfs_da_unmount( + struct xfs_mount *mp) +{ + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); } /* @@ -176,16 +216,24 @@ xfs_dir_init( xfs_inode_t *dp, xfs_inode_t *pdp) { - xfs_da_args_t args; + struct xfs_da_args *args; int error; - memset((char *)&args, 0, sizeof(args)); - args.dp = dp; - args.trans = tp; ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) + error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); + if (error) return error; - return xfs_dir2_sf_create(&args, pdp->i_ino); + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->dp = dp; + args->trans = tp; + error = xfs_dir2_sf_create(args, pdp->i_ino); + kmem_free(args); + return error; } /* @@ -201,41 +249,57 @@ xfs_dir_createname( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) return rval; XFS_STATS_INC(xs_dir_create); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = inum; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_addname(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_addname(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_addname(&args); + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); else - rval = xfs_dir2_node_addname(&args); + rval = xfs_dir2_node_addname(args); + +out_free: + kmem_free(args); return rval; } @@ -278,46 +342,67 @@ xfs_dir_lookup( xfs_ino_t *inum, /* out: inode number */ struct xfs_name *ci_name) /* out: actual name if CI match */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.dp = dp; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_OKNOENT; + /* + * We need to use KM_NOFS here so that lockdep will not throw false + * positive deadlock warnings on a non-transactional lookup path. It is + * safe to recurse into inode recalim in that case, but lockdep can't + * easily be taught about it. Hence KM_NOFS avoids having to add more + * lockdep Doing this avoids having to add a bunch of lockdep class + * annotations into the reclaim path for the ilock. + */ + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_OKNOENT; if (ci_name) - args.op_flags |= XFS_DA_OP_CILOOKUP; + args->op_flags |= XFS_DA_OP_CILOOKUP; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_lookup(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_lookup(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_lookup(&args); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_lookup(args); else - rval = xfs_dir2_node_lookup(&args); + rval = xfs_dir2_node_lookup(args); + +out_check_rval: if (rval == EEXIST) rval = 0; if (!rval) { - *inum = args.inumber; + *inum = args->inumber; if (ci_name) { - ci_name->name = args.value; - ci_name->len = args.valuelen; + ci_name->name = args->value; + ci_name->len = args->valuelen; } } +out_free: + kmem_free(args); return rval; } @@ -334,38 +419,52 @@ xfs_dir_removename( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_remove); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = ino; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_removename(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_removename(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_removename(&args); + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = ino; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_removename(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_removename(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_removename(args); else - rval = xfs_dir2_node_removename(&args); + rval = xfs_dir2_node_removename(args); +out_free: + kmem_free(args); return rval; } @@ -382,40 +481,55 @@ xfs_dir_replace( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) return rval; - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = inum; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_replace(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_replace(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_replace(&args); + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_replace(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_replace(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_replace(args); else - rval = xfs_dir2_node_replace(&args); + rval = xfs_dir2_node_replace(args); +out_free: + kmem_free(args); return rval; } @@ -430,7 +544,7 @@ xfs_dir_canenter( struct xfs_name *name, /* name of entry to add */ uint resblks) { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ @@ -439,29 +553,43 @@ xfs_dir_canenter( ASSERT(S_ISDIR(dp->i_d.di_mode)); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.dp = dp; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_addname(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_addname(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_addname(&args); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); else - rval = xfs_dir2_node_addname(&args); + rval = xfs_dir2_node_addname(args); +out_free: + kmem_free(args); return rval; } @@ -493,13 +621,13 @@ xfs_dir2_grow_inode( * Set lowest possible block in the space requested. */ bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); - count = mp->m_dirblkfsbs; + count = args->geo->fsbcount; error = xfs_da_grow_inode_int(args, &bno, count); if (error) return error; - *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); + *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno); /* * Update file's size if this is the data space and it grew. @@ -521,19 +649,16 @@ xfs_dir2_grow_inode( */ int xfs_dir2_isblock( - xfs_trans_t *tp, - xfs_inode_t *dp, - int *vp) /* out: 1 is block, 0 is not block */ + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ { - xfs_fileoff_t last; /* last file offset */ - xfs_mount_t *mp; - int rval; + xfs_fileoff_t last; /* last file offset */ + int rval; - mp = dp->i_mount; - if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; - rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize; - ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize); + rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; + ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize); *vp = rval; return 0; } @@ -543,18 +668,15 @@ xfs_dir2_isblock( */ int xfs_dir2_isleaf( - xfs_trans_t *tp, - xfs_inode_t *dp, - int *vp) /* out: 1 is leaf, 0 is not leaf */ + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ { - xfs_fileoff_t last; /* last file offset */ - xfs_mount_t *mp; - int rval; + xfs_fileoff_t last; /* last file offset */ + int rval; - mp = dp->i_mount; - if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; - *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog); + *vp = last == args->geo->leafblk + args->geo->fsbcount; return 0; } @@ -582,11 +704,11 @@ xfs_dir2_shrink_inode( dp = args->dp; mp = dp->i_mount; tp = args->trans; - da = xfs_dir2_db_to_da(mp, db); + da = xfs_dir2_db_to_da(args->geo, db); /* * Unmap the fsblock(s). */ - if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs, + if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, &done))) { /* @@ -613,12 +735,12 @@ xfs_dir2_shrink_inode( /* * If it's not a data block, we're done. */ - if (db >= XFS_DIR2_LEAF_FIRSTDB(mp)) + if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)) return 0; /* * If the block isn't the last one in the directory, we're done. */ - if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0)) + if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) return 0; bno = da; if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { @@ -627,7 +749,7 @@ xfs_dir2_shrink_inode( */ return error; } - if (db == mp->m_dirdatablk) + if (db == args->geo->datablk) ASSERT(bno == 0); else ASSERT(bno > 0); diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index 9910401327d..c8e86b0b5e9 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -32,10 +32,91 @@ struct xfs_dir2_data_unused; extern struct xfs_name xfs_name_dotdot; /* + * directory operations vector for encode/decode routines + */ +struct xfs_dir_ops { + int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len); + struct xfs_dir2_sf_entry * + (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep); + __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep); + void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype); + xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep); + void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino); + xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr); + void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino); + + int (*data_entsize)(int len); + __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep); + void (*data_put_ftype)(struct xfs_dir2_data_entry *dep, + __uint8_t ftype); + __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep); + struct xfs_dir2_data_free * + (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr); + + xfs_dir2_data_aoff_t data_dot_offset; + xfs_dir2_data_aoff_t data_dotdot_offset; + xfs_dir2_data_aoff_t data_first_offset; + size_t data_entry_offset; + + struct xfs_dir2_data_entry * + (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_unused * + (*data_unused_p)(struct xfs_dir2_data_hdr *hdr); + + int leaf_hdr_size; + void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from); + void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from); + int (*leaf_max_ents)(struct xfs_da_geometry *geo); + struct xfs_dir2_leaf_entry * + (*leaf_ents_p)(struct xfs_dir2_leaf *lp); + + int node_hdr_size; + void (*node_hdr_to_disk)(struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from); + void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from); + struct xfs_da_node_entry * + (*node_tree_p)(struct xfs_da_intnode *dap); + + int free_hdr_size; + void (*free_hdr_to_disk)(struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from); + void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from); + int (*free_max_bests)(struct xfs_da_geometry *geo); + __be16 * (*free_bests_p)(struct xfs_dir2_free *free); + xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo, + xfs_dir2_db_t db); + int (*db_to_fdindex)(struct xfs_da_geometry *geo, + xfs_dir2_db_t db); +}; + +extern const struct xfs_dir_ops * + xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp); +extern const struct xfs_dir_ops * + xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp); + +/* * Generic directory interface routines */ extern void xfs_dir_startup(void); -extern void xfs_dir_mount(struct xfs_mount *mp); +extern int xfs_da_mount(struct xfs_mount *mp); +extern void xfs_da_unmount(struct xfs_mount *mp); + extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); @@ -65,37 +146,30 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); -extern void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *sfp, - xfs_ino_t ino); -extern xfs_ino_t xfs_dir3_sfe_get_ino(struct xfs_mount *mp, - struct xfs_dir2_sf_hdr *sfp, struct xfs_dir2_sf_entry *sfep); -extern void xfs_dir3_sfe_put_ino(struct xfs_mount *mp, - struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep, - xfs_ino_t ino); - -extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r); -extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r); +extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r); +extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); -extern void xfs_dir2_data_freescan(struct xfs_mount *mp, +extern void xfs_dir2_data_freescan(struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, int *loghead); -extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_dir2_data_entry *dep); -extern void xfs_dir2_data_log_header(struct xfs_trans *tp, +extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); +extern void xfs_dir2_data_log_header(struct xfs_da_args *args, struct xfs_buf *bp); -extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_dir2_data_unused *dup); -extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp, +extern void xfs_dir2_data_log_unused(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_unused *dup); +extern void xfs_dir2_data_make_free(struct xfs_da_args *args, + struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); +extern void xfs_dir2_data_use_free(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); -extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, - xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( - struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup); + struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, + struct xfs_dir2_data_unused *dup); extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 12dad188939..c7cd3154026 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -18,25 +18,25 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_buf_item.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_dinode.h" /* * Local function prototypes. @@ -89,13 +89,14 @@ xfs_dir3_block_read_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_DIR3_DATA_CRC_OFF)) || - !xfs_dir3_block_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_block_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -107,8 +108,8 @@ xfs_dir3_block_write_verify( struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_block_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -118,7 +119,7 @@ xfs_dir3_block_write_verify( if (bip) hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); } const struct xfs_buf_ops xfs_dir3_block_buf_ops = { @@ -135,7 +136,7 @@ xfs_dir3_block_read( struct xfs_mount *mp = dp->i_mount; int err; - err = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, + err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); if (!err && tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); @@ -168,6 +169,7 @@ xfs_dir3_block_init( static void xfs_dir2_block_need_space( + struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_block_tail *btp, struct xfs_dir2_leaf_entry *blp, @@ -183,7 +185,7 @@ xfs_dir2_block_need_space( struct xfs_dir2_data_unused *enddup = NULL; *compact = 0; - bf = xfs_dir3_data_bestfree_p(hdr); + bf = dp->d_ops->data_bestfree_p(hdr); /* * If there are stale entries we'll use one for the leaf. @@ -279,7 +281,7 @@ out: */ static void xfs_dir2_block_compact( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_block_tail *btp, @@ -312,18 +314,17 @@ xfs_dir2_block_compact( *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); *lfloghigh -= be32_to_cpu(btp->stale) - 1; be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); - xfs_dir2_data_make_free(tp, bp, + xfs_dir2_data_make_free(args, bp, (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), needlog, &needscan); - blp += be32_to_cpu(btp->stale) - 1; btp->stale = cpu_to_be32(1); /* * If we now need to rebuild the bestfree map, do so. * This needs to happen before the next call to use_free. */ if (needscan) - xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog); + xfs_dir2_data_freescan(args->dp, hdr, needlog); } /* @@ -369,20 +370,20 @@ xfs_dir2_block_addname( if (error) return error; - len = xfs_dir3_data_entsize(mp, args->namelen); + len = dp->d_ops->data_entsize(args->namelen); /* * Set up pointers to parts of the block. */ hdr = bp->b_addr; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Find out if we can reuse stale entries or whether we need extra * space for entry and new leaf. */ - xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup, + xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup, &enddup, &compact, len); /* @@ -418,7 +419,7 @@ xfs_dir2_block_addname( * If need to compact the leaf entries, do it now. */ if (compact) { - xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, + xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog, &lfloghigh, &lfloglow); /* recalculate blp post-compaction */ blp = xfs_dir2_block_leaf_p(btp); @@ -453,7 +454,7 @@ xfs_dir2_block_addname( /* * Mark the space needed for the new leaf entry, now in use. */ - xfs_dir2_data_use_free(tp, bp, enddup, + xfs_dir2_data_use_free(args, bp, enddup, (xfs_dir2_data_aoff_t) ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) - sizeof(*blp)), @@ -468,7 +469,7 @@ xfs_dir2_block_addname( * This needs to happen before the next call to use_free. */ if (needscan) { - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); needscan = 0; } /* @@ -534,13 +535,13 @@ xfs_dir2_block_addname( * Fill in the leaf entry. */ blp[mid].hashval = cpu_to_be32(args->hashval); - blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); /* * Mark space for the data entry used. */ - xfs_dir2_data_use_free(tp, bp, dup, + xfs_dir2_data_use_free(args, bp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), (xfs_dir2_data_aoff_t)len, &needlog, &needscan); /* @@ -549,18 +550,18 @@ xfs_dir2_block_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, args->namelen); - xfs_dir3_dirent_put_ftype(mp, dep, args->filetype); - tagp = xfs_dir3_data_entry_tag_p(mp, dep); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Clean up the bestfree array and log the header, tail, and entry. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, bp); + xfs_dir2_data_log_header(args, bp); xfs_dir2_block_log_tail(tp, bp); - xfs_dir2_data_log_entry(tp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); xfs_dir3_data_check(dp, bp); return 0; } @@ -579,7 +580,7 @@ xfs_dir2_block_log_leaf( xfs_dir2_leaf_entry_t *blp; xfs_dir2_block_tail_t *btp; - btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); + btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); blp = xfs_dir2_block_leaf_p(btp); xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), (uint)((char *)&blp[last + 1] - (char *)hdr - 1)); @@ -596,7 +597,7 @@ xfs_dir2_block_log_tail( xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_block_tail_t *btp; - btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); + btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), (uint)((char *)(btp + 1) - (char *)hdr - 1)); } @@ -631,18 +632,19 @@ xfs_dir2_block_lookup( mp = dp->i_mount; hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Get the offset from the leaf entry, to point to the data. */ dep = (xfs_dir2_data_entry_t *)((char *)hdr + - xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); /* * Fill in inode number, CI name if appropriate, release the block. */ args->inumber = be64_to_cpu(dep->inumber); - args->filetype = xfs_dir3_dirent_get_ftype(mp, dep); + args->filetype = dp->d_ops->data_get_ftype(dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(args->trans, bp); return XFS_ERROR(error); @@ -683,7 +685,7 @@ xfs_dir2_block_lookup_int( hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Loop doing a binary search for our hash value. @@ -721,7 +723,7 @@ xfs_dir2_block_lookup_int( * Get pointer to the entry from the leaf. */ dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr)); + ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr)); /* * Compare name and if it's an exact match, return the index * and buffer. If it's the first case-insensitive match, store @@ -788,20 +790,21 @@ xfs_dir2_block_removename( tp = args->trans; mp = dp->i_mount; hdr = bp->b_addr; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry using the leaf entry. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); /* * Mark the data entry's space free. */ needlog = needscan = 0; - xfs_dir2_data_make_free(tp, bp, + xfs_dir2_data_make_free(args, bp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), - xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan); + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* * Fix up the block tail. */ @@ -816,9 +819,9 @@ xfs_dir2_block_removename( * Fix up bestfree, log the header if necessary. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, bp); + xfs_dir2_data_log_header(args, bp); xfs_dir3_data_check(dp, bp); /* * See if the size as a shortform is good enough. @@ -863,20 +866,21 @@ xfs_dir2_block_replace( dp = args->dp; mp = dp->i_mount; hdr = bp->b_addr; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry we need to change. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); ASSERT(be64_to_cpu(dep->inumber) != args->inumber); /* * Change the inode number to the new value. */ dep->inumber = cpu_to_be64(args->inumber); - xfs_dir3_dirent_put_ftype(mp, dep, args->filetype); - xfs_dir2_data_log_entry(args->trans, bp, dep); + dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_log_entry(args, bp, dep); xfs_dir3_data_check(dp, bp); return 0; } @@ -934,9 +938,9 @@ xfs_dir2_leaf_to_block( tp = args->trans; mp = dp->i_mount; leaf = lbp->b_addr; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); - ents = xfs_dir3_leaf_ents_p(leaf); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || leafhdr.magic == XFS_DIR3_LEAF1_MAGIC); @@ -946,13 +950,13 @@ xfs_dir2_leaf_to_block( * been left behind during no-space-reservation operations. * These will show up in the leaf bests table. */ - while (dp->i_d.di_size > mp->m_dirblksize) { + while (dp->i_d.di_size > args->geo->blksize) { int hdrsz; - hdrsz = xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb)); + hdrsz = dp->d_ops->data_entry_offset; bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == - mp->m_dirblksize - hdrsz) { + args->geo->blksize - hdrsz) { if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) @@ -964,7 +968,7 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { - error = xfs_dir3_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); + error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp); if (error) return error; } @@ -980,7 +984,7 @@ xfs_dir2_leaf_to_block( /* * Look at the last data entry. */ - tagp = (__be16 *)((char *)hdr + mp->m_dirblksize) - 1; + tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1; dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free or is too short we can't do it. @@ -999,12 +1003,12 @@ xfs_dir2_leaf_to_block( /* * Use up the space at the end of the block (blp/btp). */ - xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size, + xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size, &needlog, &needscan); /* * Initialize the block tail. */ - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale); btp->stale = 0; xfs_dir2_block_log_tail(tp, dbp); @@ -1023,13 +1027,13 @@ xfs_dir2_leaf_to_block( * Scan the bestfree if we need it and log the data block header. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * Pitch the old leaf block. */ - error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp); + error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp); if (error) return error; @@ -1136,15 +1140,15 @@ xfs_dir2_sf_to_block( * The whole thing is initialized to free by the init routine. * Say we're using the leaf and tail area. */ - dup = xfs_dir3_data_unused_p(hdr); + dup = dp->d_ops->data_unused_p(hdr); needlog = needscan = 0; - xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog, - &needscan); + xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, + i, &needlog, &needscan); ASSERT(needscan == 0); /* * Fill in the tail. */ - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */ btp->stale = 0; blp = xfs_dir2_block_leaf_p(btp); @@ -1152,38 +1156,38 @@ xfs_dir2_sf_to_block( /* * Remove the freespace, we'll manage it. */ - xfs_dir2_data_use_free(tp, bp, dup, + xfs_dir2_data_use_free(args, bp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), be16_to_cpu(dup->length), &needlog, &needscan); /* * Create entry for . */ - dep = xfs_dir3_data_dot_entry_p(mp, hdr); + dep = dp->d_ops->data_dot_entry_p(hdr); dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; - xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR); - tagp = xfs_dir3_data_entry_tag_p(mp, dep); + dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); - blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); /* * Create entry for .. */ - dep = xfs_dir3_data_dotdot_entry_p(mp, hdr); - dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); + dep = dp->d_ops->data_dotdot_entry_p(hdr); + dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; - xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR); - tagp = xfs_dir3_data_entry_tag_p(mp, dep); + dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); - blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); - offset = xfs_dir3_data_first_offset(mp); + offset = dp->d_ops->data_first_offset; /* * Loop over existing entries, stuff them in. */ @@ -1213,8 +1217,10 @@ xfs_dir2_sf_to_block( dup->length = cpu_to_be16(newoffset - offset); *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( ((char *)dup - (char *)hdr)); - xfs_dir2_data_log_unused(tp, bp, dup); - xfs_dir2_data_freeinsert(hdr, dup, &dummy); + xfs_dir2_data_log_unused(args, bp, dup); + xfs_dir2_data_freeinsert(hdr, + dp->d_ops->data_bestfree_p(hdr), + dup, &dummy); offset += be16_to_cpu(dup->length); continue; } @@ -1222,25 +1228,24 @@ xfs_dir2_sf_to_block( * Copy a real entry. */ dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset); - dep->inumber = cpu_to_be64(xfs_dir3_sfe_get_ino(mp, sfp, sfep)); + dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep)); dep->namelen = sfep->namelen; - xfs_dir3_dirent_put_ftype(mp, dep, - xfs_dir3_sfe_get_ftype(mp, sfp, sfep)); + dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep)); memcpy(dep->name, sfep->name, dep->namelen); - tagp = xfs_dir3_data_entry_tag_p(mp, dep); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); name.name = sfep->name; name.len = sfep->namelen; blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> hashname(&name)); - blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); offset = (int)((char *)(tagp + 1) - (char *)hdr); if (++i == sfp->count) sfep = NULL; else - sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); } /* Done with the temporary buffer */ kmem_free(sfp); diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 47e1326c169..8c2f6422648 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -18,20 +18,19 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" @@ -63,32 +62,52 @@ __xfs_dir3_data_check( char *p; /* current data position */ int stale; /* count of stale leaves */ struct xfs_name name; + const struct xfs_dir_ops *ops; + struct xfs_da_geometry *geo; mp = bp->b_target->bt_mount; + geo = mp->m_dir_geo; + + /* + * We can be passed a null dp here from a verifier, so we need to go the + * hard way to get them. + */ + ops = xfs_dir_get_ops(mp, dp); + hdr = bp->b_addr; - bf = xfs_dir3_data_bestfree_p(hdr); - p = (char *)xfs_dir3_data_entry_p(hdr); + p = (char *)ops->data_entry_p(hdr); switch (hdr->magic) { case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(geo, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; + + /* + * The number of leaf entries is limited by the size of the + * block and the amount of space used by the data entries. + * We don't know how much space is used by the data entries yet, + * so just ensure that the count falls somewhere inside the + * block right now. + */ + XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) < + ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); break; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): case cpu_to_be32(XFS_DIR2_DATA_MAGIC): - endp = (char *)hdr + mp->m_dirblksize; + endp = (char *)hdr + geo->blksize; break; default: XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); return EFSCORRUPTED; } - count = lastfree = freeseen = 0; /* * Account for zero bestfree entries. */ + bf = ops->data_bestfree_p(hdr); + count = lastfree = freeseen = 0; if (!bf[0].length) { XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); freeseen |= 1 << 0; @@ -121,7 +140,7 @@ __xfs_dir3_data_check( XFS_WANT_CORRUPTED_RETURN( be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == (char *)dup - (char *)hdr); - dfp = xfs_dir2_data_freefind(hdr, dup); + dfp = xfs_dir2_data_freefind(hdr, bf, dup); if (dfp) { i = (int)(dfp - bf); XFS_WANT_CORRUPTED_RETURN( @@ -147,17 +166,17 @@ __xfs_dir3_data_check( XFS_WANT_CORRUPTED_RETURN( !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); XFS_WANT_CORRUPTED_RETURN( - be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)) == + be16_to_cpu(*ops->data_entry_tag_p(dep)) == (char *)dep - (char *)hdr); XFS_WANT_CORRUPTED_RETURN( - xfs_dir3_dirent_get_ftype(mp, dep) < XFS_DIR3_FT_MAX); + ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); count++; lastfree = 0; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { - addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - (xfs_dir2_data_aoff_t) - ((char *)dep - (char *)hdr)); + addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)hdr)); name.name = dep->name; name.len = dep->namelen; hash = mp->m_dirnameops->hashname(&name); @@ -168,7 +187,7 @@ __xfs_dir3_data_check( } XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); } - p += xfs_dir3_data_entsize(mp, dep->namelen); + p += ops->data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. @@ -224,7 +243,6 @@ static void xfs_dir3_data_reada_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_data_hdr *hdr = bp->b_addr; switch (hdr->magic) { @@ -238,8 +256,8 @@ xfs_dir3_data_reada_verify( xfs_dir3_data_verify(bp); return; default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); break; } } @@ -250,13 +268,14 @@ xfs_dir3_data_read_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_DIR3_DATA_CRC_OFF)) || - !xfs_dir3_data_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_data_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -268,8 +287,8 @@ xfs_dir3_data_write_verify( struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_data_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -279,7 +298,7 @@ xfs_dir3_data_write_verify( if (bip) hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); } const struct xfs_buf_ops xfs_dir3_data_buf_ops = { @@ -312,12 +331,11 @@ xfs_dir3_data_read( int xfs_dir3_data_readahead( - struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno) { - return xfs_da_reada_buf(tp, dp, bno, mapped_bno, + return xfs_da_reada_buf(dp, bno, mapped_bno, XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); } @@ -327,19 +345,18 @@ xfs_dir3_data_readahead( */ xfs_dir2_data_free_t * xfs_dir2_data_freefind( - xfs_dir2_data_hdr_t *hdr, /* data block */ - xfs_dir2_data_unused_t *dup) /* data unused entry */ + struct xfs_dir2_data_hdr *hdr, /* data block header */ + struct xfs_dir2_data_free *bf, /* bestfree table pointer */ + struct xfs_dir2_data_unused *dup) /* unused space */ { xfs_dir2_data_free_t *dfp; /* bestfree entry */ xfs_dir2_data_aoff_t off; /* offset value needed */ - struct xfs_dir2_data_free *bf; #ifdef DEBUG int matched; /* matched the value */ int seenzero; /* saw a 0 bestfree entry */ #endif off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); - bf = xfs_dir3_data_bestfree_p(hdr); #ifdef DEBUG /* @@ -399,11 +416,11 @@ xfs_dir2_data_freefind( */ xfs_dir2_data_free_t * /* entry inserted */ xfs_dir2_data_freeinsert( - xfs_dir2_data_hdr_t *hdr, /* data block pointer */ - xfs_dir2_data_unused_t *dup, /* unused space */ + struct xfs_dir2_data_hdr *hdr, /* data block pointer */ + struct xfs_dir2_data_free *dfp, /* bestfree table pointer */ + struct xfs_dir2_data_unused *dup, /* unused space */ int *loghead) /* log the data header (out) */ { - xfs_dir2_data_free_t *dfp; /* bestfree table pointer */ xfs_dir2_data_free_t new; /* new bestfree entry */ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || @@ -411,7 +428,6 @@ xfs_dir2_data_freeinsert( hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - dfp = xfs_dir3_data_bestfree_p(hdr); new.length = dup->length; new.offset = cpu_to_be16((char *)dup - (char *)hdr); @@ -444,11 +460,11 @@ xfs_dir2_data_freeinsert( */ STATIC void xfs_dir2_data_freeremove( - xfs_dir2_data_hdr_t *hdr, /* data block header */ - xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */ + struct xfs_dir2_data_hdr *hdr, /* data block header */ + struct xfs_dir2_data_free *bf, /* bestfree table pointer */ + struct xfs_dir2_data_free *dfp, /* bestfree entry pointer */ int *loghead) /* out: log data header */ { - struct xfs_dir2_data_free *bf; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || @@ -458,7 +474,6 @@ xfs_dir2_data_freeremove( /* * It's the first entry, slide the next 2 up. */ - bf = xfs_dir3_data_bestfree_p(hdr); if (dfp == &bf[0]) { bf[0] = bf[1]; bf[1] = bf[2]; @@ -486,9 +501,9 @@ xfs_dir2_data_freeremove( */ void xfs_dir2_data_freescan( - xfs_mount_t *mp, /* filesystem mount point */ - xfs_dir2_data_hdr_t *hdr, /* data block header */ - int *loghead) /* out: log data header */ + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) { xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* active data entry */ @@ -496,6 +511,7 @@ xfs_dir2_data_freescan( struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ + struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || @@ -505,19 +521,19 @@ xfs_dir2_data_freescan( /* * Start by clearing the table. */ - bf = xfs_dir3_data_bestfree_p(hdr); + bf = dp->d_ops->data_bestfree_p(hdr); memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. */ - p = (char *)xfs_dir3_data_entry_p(hdr); + p = (char *)dp->d_ops->data_entry_p(hdr); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(geo, hdr); endp = (char *)xfs_dir2_block_leaf_p(btp); } else - endp = (char *)hdr + mp->m_dirblksize; + endp = (char *)hdr + geo->blksize; /* * Loop over the block's entries. */ @@ -529,7 +545,7 @@ xfs_dir2_data_freescan( if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); - xfs_dir2_data_freeinsert(hdr, dup, loghead); + xfs_dir2_data_freeinsert(hdr, bf, dup, loghead); p += be16_to_cpu(dup->length); } /* @@ -538,8 +554,8 @@ xfs_dir2_data_freescan( else { dep = (xfs_dir2_data_entry_t *)p; ASSERT((char *)dep - (char *)hdr == - be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep))); - p += xfs_dir3_data_entsize(mp, dep->namelen); + be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep))); + p += dp->d_ops->data_entsize(dep->namelen); } } } @@ -571,8 +587,8 @@ xfs_dir3_data_init( /* * Get the buffer set up for the block. */ - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, - XFS_DATA_FORK); + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno), + -1, &bp, XFS_DATA_FORK); if (error) return error; bp->b_ops = &xfs_dir3_data_buf_ops; @@ -594,8 +610,8 @@ xfs_dir3_data_init( } else hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); - bf = xfs_dir3_data_bestfree_p(hdr); - bf[0].offset = cpu_to_be16(xfs_dir3_data_entry_offset(hdr)); + bf = dp->d_ops->data_bestfree_p(hdr); + bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { bf[i].length = 0; bf[i].offset = 0; @@ -604,18 +620,18 @@ xfs_dir3_data_init( /* * Set up an unused entry for the block's body. */ - dup = xfs_dir3_data_unused_p(hdr); + dup = dp->d_ops->data_unused_p(hdr); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); - t = mp->m_dirblksize - (uint)xfs_dir3_data_entry_offset(hdr); + t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset; bf[0].length = cpu_to_be16(t); dup->length = cpu_to_be16(t); *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr); /* * Log it and return it. */ - xfs_dir2_data_log_header(tp, bp); - xfs_dir2_data_log_unused(tp, bp, dup); + xfs_dir2_data_log_header(args, bp); + xfs_dir2_data_log_unused(args, bp, dup); *bpp = bp; return 0; } @@ -625,20 +641,19 @@ xfs_dir3_data_init( */ void xfs_dir2_data_log_entry( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_entry_t *dep) /* data entry pointer */ { struct xfs_dir2_data_hdr *hdr = bp->b_addr; - struct xfs_mount *mp = tp->t_mountp; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), - (uint)((char *)(xfs_dir3_data_entry_tag_p(mp, dep) + 1) - + xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr), + (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) - (char *)hdr - 1)); } @@ -647,17 +662,20 @@ xfs_dir2_data_log_entry( */ void xfs_dir2_data_log_header( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp) { - xfs_dir2_data_hdr_t *hdr = bp->b_addr; +#ifdef DEBUG + struct xfs_dir2_data_hdr *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); +#endif - xfs_trans_log_buf(tp, bp, 0, xfs_dir3_data_entry_offset(hdr) - 1); + xfs_trans_log_buf(args->trans, bp, 0, + args->dp->d_ops->data_entry_offset - 1); } /* @@ -665,7 +683,7 @@ xfs_dir2_data_log_header( */ void xfs_dir2_data_log_unused( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_unused_t *dup) /* data unused pointer */ { @@ -679,13 +697,13 @@ xfs_dir2_data_log_unused( /* * Log the first part of the unused entry. */ - xfs_trans_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr), + xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr), (uint)((char *)&dup->length + sizeof(dup->length) - 1 - (char *)hdr)); /* * Log the end (tag) of the unused entry. */ - xfs_trans_log_buf(tp, bp, + xfs_trans_log_buf(args->trans, bp, (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr), (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr + sizeof(xfs_dir2_data_off_t) - 1)); @@ -697,7 +715,7 @@ xfs_dir2_data_log_unused( */ void xfs_dir2_data_make_free( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, /* starting byte offset */ xfs_dir2_data_aoff_t len, /* length in bytes */ @@ -707,14 +725,12 @@ xfs_dir2_data_make_free( xfs_dir2_data_hdr_t *hdr; /* data block pointer */ xfs_dir2_data_free_t *dfp; /* bestfree pointer */ char *endptr; /* end of data area */ - xfs_mount_t *mp; /* filesystem mount point */ int needscan; /* need to regen bestfree */ xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *postdup; /* unused entry after us */ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ struct xfs_dir2_data_free *bf; - mp = tp->t_mountp; hdr = bp->b_addr; /* @@ -722,20 +738,20 @@ xfs_dir2_data_make_free( */ if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - endptr = (char *)hdr + mp->m_dirblksize; + endptr = (char *)hdr + args->geo->blksize; else { xfs_dir2_block_tail_t *btp; /* block tail */ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); } /* * If this isn't the start of the block, then back up to * the previous entry and see if it's free. */ - if (offset > xfs_dir3_data_entry_offset(hdr)) { + if (offset > args->dp->d_ops->data_entry_offset) { __be16 *tagp; /* tag just before us */ tagp = (__be16 *)((char *)hdr + offset) - 1; @@ -761,15 +777,15 @@ xfs_dir2_data_make_free( * Previous and following entries are both free, * merge everything into a single free entry. */ - bf = xfs_dir3_data_bestfree_p(hdr); + bf = args->dp->d_ops->data_bestfree_p(hdr); if (prevdup && postdup) { xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ /* * See if prevdup and/or postdup are in bestfree table. */ - dfp = xfs_dir2_data_freefind(hdr, prevdup); - dfp2 = xfs_dir2_data_freefind(hdr, postdup); + dfp = xfs_dir2_data_freefind(hdr, bf, prevdup); + dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup); /* * We need a rescan unless there are exactly 2 free entries * namely our two. Then we know what's happening, otherwise @@ -783,7 +799,7 @@ xfs_dir2_data_make_free( be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length)); *xfs_dir2_data_unused_tag_p(prevdup) = cpu_to_be16((char *)prevdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, prevdup); + xfs_dir2_data_log_unused(args, bp, prevdup); if (!needscan) { /* * Has to be the case that entries 0 and 1 are @@ -797,12 +813,13 @@ xfs_dir2_data_make_free( ASSERT(dfp2 == dfp); dfp2 = &bf[1]; } - xfs_dir2_data_freeremove(hdr, dfp2, needlogp); - xfs_dir2_data_freeremove(hdr, dfp, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); /* * Now insert the new entry. */ - dfp = xfs_dir2_data_freeinsert(hdr, prevdup, needlogp); + dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup, + needlogp); ASSERT(dfp == &bf[0]); ASSERT(dfp->length == prevdup->length); ASSERT(!dfp[1].length); @@ -813,19 +830,19 @@ xfs_dir2_data_make_free( * The entry before us is free, merge with it. */ else if (prevdup) { - dfp = xfs_dir2_data_freefind(hdr, prevdup); + dfp = xfs_dir2_data_freefind(hdr, bf, prevdup); be16_add_cpu(&prevdup->length, len); *xfs_dir2_data_unused_tag_p(prevdup) = cpu_to_be16((char *)prevdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, prevdup); + xfs_dir2_data_log_unused(args, bp, prevdup); /* * If the previous entry was in the table, the new entry * is longer, so it will be in the table too. Remove * the old one and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(hdr, dfp, needlogp); - xfs_dir2_data_freeinsert(hdr, prevdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp); } /* * Otherwise we need a scan if the new entry is big enough. @@ -839,21 +856,21 @@ xfs_dir2_data_make_free( * The following entry is free, merge with it. */ else if (postdup) { - dfp = xfs_dir2_data_freefind(hdr, postdup); + dfp = xfs_dir2_data_freefind(hdr, bf, postdup); newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length)); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If the following entry was in the table, the new entry * is longer, so it will be in the table too. Remove * the old one and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(hdr, dfp, needlogp); - xfs_dir2_data_freeinsert(hdr, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); } /* * Otherwise we need a scan if the new entry is big enough. @@ -872,8 +889,8 @@ xfs_dir2_data_make_free( newdup->length = cpu_to_be16(len); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); - xfs_dir2_data_freeinsert(hdr, newdup, needlogp); + xfs_dir2_data_log_unused(args, bp, newdup); + xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); } *needscanp = needscan; } @@ -883,7 +900,7 @@ xfs_dir2_data_make_free( */ void xfs_dir2_data_use_free( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_unused_t *dup, /* unused entry */ xfs_dir2_data_aoff_t offset, /* starting offset to use */ @@ -913,9 +930,9 @@ xfs_dir2_data_use_free( /* * Look up the entry in the bestfree table. */ - dfp = xfs_dir2_data_freefind(hdr, dup); oldlen = be16_to_cpu(dup->length); - bf = xfs_dir3_data_bestfree_p(hdr); + bf = args->dp->d_ops->data_bestfree_p(hdr); + dfp = xfs_dir2_data_freefind(hdr, bf, dup); ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length)); /* * Check for alignment with front and back of the entry. @@ -932,7 +949,8 @@ xfs_dir2_data_use_free( if (dfp) { needscan = (bf[2].offset != 0); if (!needscan) - xfs_dir2_data_freeremove(hdr, dfp, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, + needlogp); } } /* @@ -945,13 +963,14 @@ xfs_dir2_data_use_free( newdup->length = cpu_to_be16(oldlen - len); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If it was in the table, remove it and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(hdr, dfp, needlogp); - dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); ASSERT(dfp != NULL); ASSERT(dfp->length == newdup->length); ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); @@ -972,13 +991,14 @@ xfs_dir2_data_use_free( newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If it was in the table, remove it and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(hdr, dfp, needlogp); - dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); ASSERT(dfp != NULL); ASSERT(dfp->length == newdup->length); ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); @@ -999,13 +1019,13 @@ xfs_dir2_data_use_free( newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length)); *xfs_dir2_data_unused_tag_p(newdup2) = cpu_to_be16((char *)newdup2 - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup2); + xfs_dir2_data_log_unused(args, bp, newdup2); /* * If the old entry was in the table, we need to scan * if the 3rd entry was valid, since these entries @@ -1017,9 +1037,11 @@ xfs_dir2_data_use_free( if (dfp) { needscan = (bf[2].length != 0); if (!needscan) { - xfs_dir2_data_freeremove(hdr, dfp, needlogp); - xfs_dir2_data_freeinsert(hdr, newdup, needlogp); - xfs_dir2_data_freeinsert(hdr, newdup2, + xfs_dir2_data_freeremove(hdr, bf, dfp, + needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup2, needlogp); } } diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 1021c8356d0..fb0aad4440c 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -18,23 +18,21 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" @@ -43,30 +41,31 @@ */ static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, int *indexp, struct xfs_buf **dbpp); -static void xfs_dir3_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, - int first, int last); -static void xfs_dir3_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); +static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args, + struct xfs_buf *bp, int first, int last); +static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args, + struct xfs_buf *bp); /* * Check the internal consistency of a leaf1 block. * Pop an assert if something is wrong. */ #ifdef DEBUG -#define xfs_dir3_leaf_check(mp, bp) \ +#define xfs_dir3_leaf_check(dp, bp) \ do { \ - if (!xfs_dir3_leaf1_check((mp), (bp))) \ + if (!xfs_dir3_leaf1_check((dp), (bp))) \ ASSERT(0); \ } while (0); STATIC bool xfs_dir3_leaf1_check( - struct xfs_mount *mp, + struct xfs_inode *dp, struct xfs_buf *bp) { struct xfs_dir2_leaf *leaf = bp->b_addr; struct xfs_dir3_icleaf_hdr leafhdr; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; @@ -75,71 +74,16 @@ xfs_dir3_leaf1_check( } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) return false; - return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); + return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); } #else -#define xfs_dir3_leaf_check(mp, bp) +#define xfs_dir3_leaf_check(dp, bp) #endif -void -xfs_dir3_leaf_hdr_from_disk( - struct xfs_dir3_icleaf_hdr *to, - struct xfs_dir2_leaf *from) -{ - if (from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || - from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { - to->forw = be32_to_cpu(from->hdr.info.forw); - to->back = be32_to_cpu(from->hdr.info.back); - to->magic = be16_to_cpu(from->hdr.info.magic); - to->count = be16_to_cpu(from->hdr.count); - to->stale = be16_to_cpu(from->hdr.stale); - } else { - struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from; - - to->forw = be32_to_cpu(hdr3->info.hdr.forw); - to->back = be32_to_cpu(hdr3->info.hdr.back); - to->magic = be16_to_cpu(hdr3->info.hdr.magic); - to->count = be16_to_cpu(hdr3->count); - to->stale = be16_to_cpu(hdr3->stale); - } - - ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC || - to->magic == XFS_DIR3_LEAF1_MAGIC || - to->magic == XFS_DIR2_LEAFN_MAGIC || - to->magic == XFS_DIR3_LEAFN_MAGIC); -} - -void -xfs_dir3_leaf_hdr_to_disk( - struct xfs_dir2_leaf *to, - struct xfs_dir3_icleaf_hdr *from) -{ - ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || - from->magic == XFS_DIR3_LEAF1_MAGIC || - from->magic == XFS_DIR2_LEAFN_MAGIC || - from->magic == XFS_DIR3_LEAFN_MAGIC); - - if (from->magic == XFS_DIR2_LEAF1_MAGIC || - from->magic == XFS_DIR2_LEAFN_MAGIC) { - to->hdr.info.forw = cpu_to_be32(from->forw); - to->hdr.info.back = cpu_to_be32(from->back); - to->hdr.info.magic = cpu_to_be16(from->magic); - to->hdr.count = cpu_to_be16(from->count); - to->hdr.stale = cpu_to_be16(from->stale); - } else { - struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to; - - hdr3->info.hdr.forw = cpu_to_be32(from->forw); - hdr3->info.hdr.back = cpu_to_be32(from->back); - hdr3->info.hdr.magic = cpu_to_be16(from->magic); - hdr3->count = cpu_to_be16(from->count); - hdr3->stale = cpu_to_be16(from->stale); - } -} - bool xfs_dir3_leaf_check_int( struct xfs_mount *mp, + struct xfs_inode *dp, struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf) { @@ -147,16 +91,30 @@ xfs_dir3_leaf_check_int( xfs_dir2_leaf_tail_t *ltp; int stale; int i; + const struct xfs_dir_ops *ops; + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_da_geometry *geo = mp->m_dir_geo; + + /* + * we can be passed a null dp here from a verifier, so we need to go the + * hard way to get them. + */ + ops = xfs_dir_get_ops(mp, dp); + + if (!hdr) { + ops->leaf_hdr_from_disk(&leafhdr, leaf); + hdr = &leafhdr; + } - ents = xfs_dir3_leaf_ents_p(leaf); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ents = ops->leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); /* * XXX (dgc): This value is not restrictive enough. * Should factor in the size of the bests table as well. * We can deduce a value for that from di_size. */ - if (hdr->count > xfs_dir3_max_leaf_ents(mp, leaf)) + if (hdr->count > ops->leaf_max_ents(geo)) return false; /* Leaves and bests don't overlap in leaf format. */ @@ -192,7 +150,6 @@ xfs_dir3_leaf_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_leaf *leaf = bp->b_addr; - struct xfs_dir3_icleaf_hdr leafhdr; ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); @@ -214,8 +171,7 @@ xfs_dir3_leaf_verify( return false; } - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); - return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); + return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); } static void @@ -225,13 +181,14 @@ __read_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_DIR3_LEAF_CRC_OFF)) || - !xfs_dir3_leaf_verify(bp, magic)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_leaf_verify(bp, magic)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -244,8 +201,8 @@ __write_verify( struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_leaf_verify(bp, magic)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -255,7 +212,7 @@ __write_verify( if (bip) hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); } static void @@ -368,7 +325,7 @@ xfs_dir3_leaf_init( if (type == XFS_DIR2_LEAF1_MAGIC) { struct xfs_dir2_leaf_tail *ltp; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf); ltp->bestcount = 0; bp->b_ops = &xfs_dir3_leaf1_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF); @@ -392,18 +349,18 @@ xfs_dir3_leaf_get_buf( int error; ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); - ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && - bno < XFS_DIR2_FREE_FIRSTDB(mp)); + ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) && + bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, - XFS_DATA_FORK); + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno), + -1, &bp, XFS_DATA_FORK); if (error) return error; xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); - xfs_dir3_leaf_log_header(tp, bp); + xfs_dir3_leaf_log_header(args, bp); if (magic == XFS_DIR2_LEAF1_MAGIC) - xfs_dir3_leaf_log_tail(tp, bp); + xfs_dir3_leaf_log_tail(args, bp); *bpp = bp; return 0; } @@ -448,8 +405,8 @@ xfs_dir2_block_to_leaf( if ((error = xfs_da_grow_inode(args, &blkno))) { return error; } - ldb = xfs_dir2_da_to_db(mp, blkno); - ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp)); + ldb = xfs_dir2_da_to_db(args->geo, blkno); + ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)); /* * Initialize the leaf block, get a buffer for it. */ @@ -460,35 +417,35 @@ xfs_dir2_block_to_leaf( leaf = lbp->b_addr; hdr = dbp->b_addr; xfs_dir3_data_check(dp, dbp); - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); - bf = xfs_dir3_data_bestfree_p(hdr); - ents = xfs_dir3_leaf_ents_p(leaf); + bf = dp->d_ops->data_bestfree_p(hdr); + ents = dp->d_ops->leaf_ents_p(leaf); /* * Set the counts in the leaf header. */ - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); leafhdr.count = be32_to_cpu(btp->count); leafhdr.stale = be32_to_cpu(btp->stale); - xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, lbp); + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); /* * Could compact these but I think we always do the conversion * after squeezing out stale entries. */ memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(tp, lbp, 0, leafhdr.count - 1); + xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1); needscan = 0; needlog = 1; /* * Make the space formerly occupied by the leaf entries and block * tail be free. */ - xfs_dir2_data_make_free(tp, dbp, + xfs_dir2_data_make_free(args, dbp, (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), - (xfs_dir2_data_aoff_t)((char *)hdr + mp->m_dirblksize - + (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize - (char *)blp), &needlog, &needscan); /* @@ -502,11 +459,11 @@ xfs_dir2_block_to_leaf( hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); /* * Set up leaf tail and bests table. */ - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ltp->bestcount = cpu_to_be32(1); bestsp = xfs_dir2_leaf_bests_p(ltp); bestsp[0] = bf[0].length; @@ -514,10 +471,10 @@ xfs_dir2_block_to_leaf( * Log the data header and leaf bests table. */ if (needlog) - xfs_dir2_data_log_header(tp, dbp); - xfs_dir3_leaf_check(mp, lbp); + xfs_dir2_data_log_header(args, dbp); + xfs_dir3_leaf_check(dp, lbp); xfs_dir3_data_check(dp, dbp); - xfs_dir3_leaf_log_bests(tp, lbp, 0, 0); + xfs_dir3_leaf_log_bests(args, lbp, 0, 0); return 0; } @@ -686,7 +643,7 @@ xfs_dir2_leaf_addname( tp = args->trans; mp = dp->i_mount; - error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) return error; @@ -698,11 +655,11 @@ xfs_dir2_leaf_addname( */ index = xfs_dir2_leaf_search_hash(args, lbp); leaf = lbp->b_addr; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - ents = xfs_dir3_leaf_ents_p(leaf); - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); - length = xfs_dir3_data_entsize(mp, args->namelen); + length = dp->d_ops->data_entsize(args->namelen); /* * See if there are any entries with the same hash value @@ -715,7 +672,7 @@ xfs_dir2_leaf_addname( index++, lep++) { if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) continue; - i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); ASSERT(i < be32_to_cpu(ltp->bestcount)); ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF)); if (be16_to_cpu(bestsp[i]) >= length) { @@ -855,16 +812,17 @@ xfs_dir2_leaf_addname( memmove(&bestsp[0], &bestsp[1], be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0])); be32_add_cpu(<p->bestcount, 1); - xfs_dir3_leaf_log_tail(tp, lbp); - xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); } /* * If we're filling in a previously empty block just log it. */ else - xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); hdr = dbp->b_addr; - bf = xfs_dir3_data_bestfree_p(hdr); + bf = dp->d_ops->data_bestfree_p(hdr); bestsp[use_block] = bf[0].length; grown = 1; } else { @@ -873,14 +831,14 @@ xfs_dir2_leaf_addname( * Just read that one in. */ error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(mp, use_block), - -1, &dbp); + xfs_dir2_db_to_da(args->geo, use_block), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; } hdr = dbp->b_addr; - bf = xfs_dir3_data_bestfree_p(hdr); + bf = dp->d_ops->data_bestfree_p(hdr); grown = 0; } /* @@ -893,7 +851,7 @@ xfs_dir2_leaf_addname( /* * Mark the initial part of our freespace in use for the new entry. */ - xfs_dir2_data_use_free(tp, dbp, dup, + xfs_dir2_data_use_free(args, dbp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, &needlog, &needscan); /* @@ -903,20 +861,20 @@ xfs_dir2_leaf_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - xfs_dir3_dirent_put_ftype(mp, dep, args->filetype); - tagp = xfs_dir3_data_entry_tag_p(mp, dep); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Need to scan fix up the bestfree table. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); /* * Need to log the data block's header. */ if (needlog) - xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_data_log_entry(tp, dbp, dep); + xfs_dir2_data_log_header(args, dbp); + xfs_dir2_data_log_entry(args, dbp, dep); /* * If the bests table needs to be changed, do it. * Log the change unless we've already done that. @@ -924,7 +882,7 @@ xfs_dir2_leaf_addname( if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) { bestsp[use_block] = bf[0].length; if (!grown) - xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); } lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, @@ -934,15 +892,16 @@ xfs_dir2_leaf_addname( * Fill in the new leaf entry. */ lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block, + lep->address = cpu_to_be32( + xfs_dir2_db_off_to_dataptr(args->geo, use_block, be16_to_cpu(*tagp))); /* * Log the leaf fields and give up the buffers. */ - xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, lbp); - xfs_dir3_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); - xfs_dir3_leaf_check(mp, lbp); + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(dp, lbp); xfs_dir3_data_check(dp, dbp); return 0; } @@ -962,6 +921,7 @@ xfs_dir3_leaf_compact( int loglow; /* first leaf entry to log */ int to; /* target leaf index */ struct xfs_dir2_leaf_entry *ents; + struct xfs_inode *dp = args->dp; leaf = bp->b_addr; if (!leafhdr->stale) @@ -970,7 +930,7 @@ xfs_dir3_leaf_compact( /* * Compress out the stale entries in place. */ - ents = xfs_dir3_leaf_ents_p(leaf); + ents = dp->d_ops->leaf_ents_p(leaf); for (from = to = 0, loglow = -1; from < leafhdr->count; from++) { if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; @@ -991,10 +951,10 @@ xfs_dir3_leaf_compact( leafhdr->count -= leafhdr->stale; leafhdr->stale = 0; - xfs_dir3_leaf_hdr_to_disk(leaf, leafhdr); - xfs_dir3_leaf_log_header(args->trans, bp); + dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr); + xfs_dir3_leaf_log_header(args, bp); if (loglow != -1) - xfs_dir3_leaf_log_ents(args->trans, bp, loglow, to - 1); + xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1); } /* @@ -1096,7 +1056,7 @@ xfs_dir3_leaf_compact_x1( */ static void xfs_dir3_leaf_log_bests( - xfs_trans_t *tp, /* transaction pointer */ + struct xfs_da_args *args, struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ int last) /* last entry to log */ @@ -1109,10 +1069,11 @@ xfs_dir3_leaf_log_bests( ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)); - ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); firstb = xfs_dir2_leaf_bests_p(ltp) + first; lastb = xfs_dir2_leaf_bests_p(ltp) + last; - xfs_trans_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstb - (char *)leaf), (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); } @@ -1121,10 +1082,10 @@ xfs_dir3_leaf_log_bests( */ void xfs_dir3_leaf_log_ents( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *bp, /* leaf buffer */ - int first, /* first entry to log */ - int last) /* last entry to log */ + struct xfs_da_args *args, + struct xfs_buf *bp, + int first, + int last) { xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ @@ -1136,10 +1097,11 @@ xfs_dir3_leaf_log_ents( leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - ents = xfs_dir3_leaf_ents_p(leaf); + ents = args->dp->d_ops->leaf_ents_p(leaf); firstlep = &ents[first]; lastlep = &ents[last]; - xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); } @@ -1148,7 +1110,7 @@ xfs_dir3_leaf_log_ents( */ void xfs_dir3_leaf_log_header( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp) { struct xfs_dir2_leaf *leaf = bp->b_addr; @@ -1158,8 +1120,9 @@ xfs_dir3_leaf_log_header( leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), - xfs_dir3_leaf_hdr_size(leaf) - 1); + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)&leaf->hdr - (char *)leaf), + args->dp->d_ops->leaf_hdr_size - 1); } /* @@ -1167,21 +1130,20 @@ xfs_dir3_leaf_log_header( */ STATIC void xfs_dir3_leaf_log_tail( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp) { struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - struct xfs_mount *mp = tp->t_mountp; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), - (uint)(mp->m_dirblksize - 1)); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf), + (uint)(args->geo->blksize - 1)); } /* @@ -1214,9 +1176,9 @@ xfs_dir2_leaf_lookup( } tp = args->trans; dp = args->dp; - xfs_dir3_leaf_check(dp->i_mount, lbp); + xfs_dir3_leaf_check(dp, lbp); leaf = lbp->b_addr; - ents = xfs_dir3_leaf_ents_p(leaf); + ents = dp->d_ops->leaf_ents_p(leaf); /* * Get to the leaf entry and contained data entry address. */ @@ -1227,12 +1189,12 @@ xfs_dir2_leaf_lookup( */ dep = (xfs_dir2_data_entry_t *) ((char *)dbp->b_addr + - xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); /* * Return the found inode number & CI name if appropriate */ args->inumber = be64_to_cpu(dep->inumber); - args->filetype = xfs_dir3_dirent_get_ftype(dp->i_mount, dep); + args->filetype = dp->d_ops->data_get_ftype(dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(tp, dbp); xfs_trans_brelse(tp, lbp); @@ -1273,15 +1235,15 @@ xfs_dir2_leaf_lookup_int( tp = args->trans; mp = dp->i_mount; - error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) return error; *lbpp = lbp; leaf = lbp->b_addr; - xfs_dir3_leaf_check(mp, lbp); - ents = xfs_dir3_leaf_ents_p(leaf); - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir3_leaf_check(dp, lbp); + ents = dp->d_ops->leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); /* * Look for the first leaf entry with our hash value. @@ -1302,7 +1264,8 @@ xfs_dir2_leaf_lookup_int( /* * Get the new data block number. */ - newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); /* * If it's not the same as the old data block number, * need to pitch the old one and read the new one. @@ -1311,8 +1274,8 @@ xfs_dir2_leaf_lookup_int( if (dbp) xfs_trans_brelse(tp, dbp); error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(mp, newdb), - -1, &dbp); + xfs_dir2_db_to_da(args->geo, newdb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1323,7 +1286,8 @@ xfs_dir2_leaf_lookup_int( * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr + - xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); /* * Compare name and if it's an exact match, return the index * and buffer. If it's the first case-insensitive match, store @@ -1352,8 +1316,8 @@ xfs_dir2_leaf_lookup_int( if (cidb != curdb) { xfs_trans_brelse(tp, dbp); error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(mp, cidb), - -1, &dbp); + xfs_dir2_db_to_da(args->geo, cidb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1415,60 +1379,60 @@ xfs_dir2_leaf_removename( leaf = lbp->b_addr; hdr = dbp->b_addr; xfs_dir3_data_check(dp, dbp); - bf = xfs_dir3_data_bestfree_p(hdr); - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); - ents = xfs_dir3_leaf_ents_p(leaf); + bf = dp->d_ops->data_bestfree_p(hdr); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); /* * Point to the leaf entry, use that to point to the data entry. */ lep = &ents[index]; - db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); needscan = needlog = 0; oldbest = be16_to_cpu(bf[0].length); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); ASSERT(be16_to_cpu(bestsp[db]) == oldbest); /* * Mark the former data entry unused. */ - xfs_dir2_data_make_free(tp, dbp, + xfs_dir2_data_make_free(args, dbp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), - xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan); + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* * We just mark the leaf entry stale by putting a null in it. */ leafhdr.stale++; - xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, lbp); + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir3_leaf_log_ents(tp, lbp, index, index); + xfs_dir3_leaf_log_ents(args, lbp, index, index); /* * Scan the freespace in the data block again if necessary, * log the data block header if necessary. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * If the longest freespace in the data block has changed, * put the new value in the bests table and log that. */ if (be16_to_cpu(bf[0].length) != oldbest) { bestsp[db] = bf[0].length; - xfs_dir3_leaf_log_bests(tp, lbp, db, db); + xfs_dir3_leaf_log_bests(args, lbp, db, db); } xfs_dir3_data_check(dp, dbp); /* * If the data block is now empty then get rid of the data block. */ if (be16_to_cpu(bf[0].length) == - mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)) { - ASSERT(db != mp->m_dirdatablk); + args->geo->blksize - dp->d_ops->data_entry_offset) { + ASSERT(db != args->geo->datablk); if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { /* * Nope, can't get rid of it because it caused @@ -1478,7 +1442,7 @@ xfs_dir2_leaf_removename( */ if (error == ENOSPC && args->total == 0) error = 0; - xfs_dir3_leaf_check(mp, lbp); + xfs_dir3_leaf_check(dp, lbp); return error; } dbp = NULL; @@ -1501,18 +1465,19 @@ xfs_dir2_leaf_removename( memmove(&bestsp[db - i], bestsp, (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp)); be32_add_cpu(<p->bestcount, -(db - i)); - xfs_dir3_leaf_log_tail(tp, lbp); - xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); } else bestsp[db] = cpu_to_be16(NULLDATAOFF); } /* * If the data block was not the first one, drop it. */ - else if (db != mp->m_dirdatablk) + else if (db != args->geo->datablk) dbp = NULL; - xfs_dir3_leaf_check(mp, lbp); + xfs_dir3_leaf_check(dp, lbp); /* * See if we can convert to block form. */ @@ -1547,7 +1512,7 @@ xfs_dir2_leaf_replace( } dp = args->dp; leaf = lbp->b_addr; - ents = xfs_dir3_leaf_ents_p(leaf); + ents = dp->d_ops->leaf_ents_p(leaf); /* * Point to the leaf entry, get data address from it. */ @@ -1557,16 +1522,16 @@ xfs_dir2_leaf_replace( */ dep = (xfs_dir2_data_entry_t *) ((char *)dbp->b_addr + - xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); ASSERT(args->inumber != be64_to_cpu(dep->inumber)); /* * Put the new inode number in, log it. */ dep->inumber = cpu_to_be64(args->inumber); - xfs_dir3_dirent_put_ftype(dp->i_mount, dep, args->filetype); + dp->d_ops->data_put_ftype(dep, args->filetype); tp = args->trans; - xfs_dir2_data_log_entry(tp, dbp, dep); - xfs_dir3_leaf_check(dp->i_mount, lbp); + xfs_dir2_data_log_entry(args, dbp, dep); + xfs_dir3_leaf_check(dp, lbp); xfs_trans_brelse(tp, lbp); return 0; } @@ -1592,8 +1557,8 @@ xfs_dir2_leaf_search_hash( struct xfs_dir3_icleaf_hdr leafhdr; leaf = lbp->b_addr; - ents = xfs_dir3_leaf_ents_p(leaf); - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = args->dp->d_ops->leaf_ents_p(leaf); + args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); /* * Note, the table cannot be empty, so we have to go through the loop. @@ -1651,22 +1616,23 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db), + -1, &dbp); if (error) return error; leaf = lbp->b_addr; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); #ifdef DEBUG { struct xfs_dir2_data_hdr *hdr = dbp->b_addr; - struct xfs_dir2_data_free *bf = xfs_dir3_data_bestfree_p(hdr); + struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr); ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); ASSERT(be16_to_cpu(bf[0].length) == - mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)); + args->geo->blksize - dp->d_ops->data_entry_offset); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); } #endif @@ -1685,8 +1651,8 @@ xfs_dir2_leaf_trim_data( bestsp = xfs_dir2_leaf_bests_p(ltp); be32_add_cpu(<p->bestcount, -1); memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); - xfs_dir3_leaf_log_tail(tp, lbp); - xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); return 0; } @@ -1750,22 +1716,22 @@ xfs_dir2_node_to_leaf( /* * Get the last offset in the file. */ - if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) { + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) { return error; } - fo -= mp->m_dirblkfsbs; + fo -= args->geo->fsbcount; /* * If there are freespace blocks other than the first one, * take this opportunity to remove trailing empty freespace blocks * that may have been left behind during no-space-reservation * operations. */ - while (fo > mp->m_dirfreeblk) { + while (fo > args->geo->freeblk) { if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) { return error; } if (rval) - fo -= mp->m_dirblkfsbs; + fo -= args->geo->fsbcount; else return 0; } @@ -1778,11 +1744,11 @@ xfs_dir2_node_to_leaf( /* * If it's not the single leaf block, give up. */ - if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize) + if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize) return 0; lbp = state->path.blk[0].bp; leaf = lbp->b_addr; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); @@ -1790,11 +1756,11 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. */ - error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp); + error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); if (error) return error; free = fbp->b_addr; - xfs_dir3_free_hdr_from_disk(&freehdr, free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); ASSERT(!freehdr.firstdb); @@ -1802,7 +1768,7 @@ xfs_dir2_node_to_leaf( * Now see if the leafn and free data will fit in a leaf1. * If not, release the buffer and give up. */ - if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > mp->m_dirblksize) { + if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) { xfs_trans_brelse(tp, fbp); return 0; } @@ -1822,25 +1788,27 @@ xfs_dir2_node_to_leaf( /* * Set up the leaf tail from the freespace block. */ - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ltp->bestcount = cpu_to_be32(freehdr.nvalid); /* * Set up the leaf bests table. */ - memcpy(xfs_dir2_leaf_bests_p(ltp), xfs_dir3_free_bests_p(mp, free), + memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free), freehdr.nvalid * sizeof(xfs_dir2_data_off_t)); - xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, lbp); - xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); - xfs_dir3_leaf_log_tail(tp, lbp); - xfs_dir3_leaf_check(mp, lbp); + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_check(dp, lbp); /* * Get rid of the freespace block. */ - error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp); + error = xfs_dir2_shrink_inode(args, + xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET), + fbp); if (error) { /* * This can't fail here because it can only happen when diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 4c3dba7ffb7..da43d304fca 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -18,22 +18,21 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" @@ -55,21 +54,21 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args, * Check internal consistency of a leafn block. */ #ifdef DEBUG -#define xfs_dir3_leaf_check(mp, bp) \ +#define xfs_dir3_leaf_check(dp, bp) \ do { \ - if (!xfs_dir3_leafn_check((mp), (bp))) \ + if (!xfs_dir3_leafn_check((dp), (bp))) \ ASSERT(0); \ } while (0); static bool xfs_dir3_leafn_check( - struct xfs_mount *mp, + struct xfs_inode *dp, struct xfs_buf *bp) { struct xfs_dir2_leaf *leaf = bp->b_addr; struct xfs_dir3_icleaf_hdr leafhdr; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; @@ -78,10 +77,10 @@ xfs_dir3_leafn_check( } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) return false; - return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); + return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); } #else -#define xfs_dir3_leaf_check(mp, bp) +#define xfs_dir3_leaf_check(dp, bp) #endif static bool @@ -116,13 +115,14 @@ xfs_dir3_free_read_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_DIR3_FREE_CRC_OFF)) || - !xfs_dir3_free_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_free_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -134,8 +134,8 @@ xfs_dir3_free_write_verify( struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_free_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -145,7 +145,7 @@ xfs_dir3_free_write_verify( if (bip) hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF); } const struct xfs_buf_ops xfs_dir3_free_buf_ops = { @@ -193,66 +193,20 @@ xfs_dir2_free_try_read( return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp); } - -void -xfs_dir3_free_hdr_from_disk( - struct xfs_dir3_icfree_hdr *to, - struct xfs_dir2_free *from) -{ - if (from->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)) { - to->magic = be32_to_cpu(from->hdr.magic); - to->firstdb = be32_to_cpu(from->hdr.firstdb); - to->nvalid = be32_to_cpu(from->hdr.nvalid); - to->nused = be32_to_cpu(from->hdr.nused); - } else { - struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from; - - to->magic = be32_to_cpu(hdr3->hdr.magic); - to->firstdb = be32_to_cpu(hdr3->firstdb); - to->nvalid = be32_to_cpu(hdr3->nvalid); - to->nused = be32_to_cpu(hdr3->nused); - } - - ASSERT(to->magic == XFS_DIR2_FREE_MAGIC || - to->magic == XFS_DIR3_FREE_MAGIC); -} - -static void -xfs_dir3_free_hdr_to_disk( - struct xfs_dir2_free *to, - struct xfs_dir3_icfree_hdr *from) -{ - ASSERT(from->magic == XFS_DIR2_FREE_MAGIC || - from->magic == XFS_DIR3_FREE_MAGIC); - - if (from->magic == XFS_DIR2_FREE_MAGIC) { - to->hdr.magic = cpu_to_be32(from->magic); - to->hdr.firstdb = cpu_to_be32(from->firstdb); - to->hdr.nvalid = cpu_to_be32(from->nvalid); - to->hdr.nused = cpu_to_be32(from->nused); - } else { - struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to; - - hdr3->hdr.magic = cpu_to_be32(from->magic); - hdr3->firstdb = cpu_to_be32(from->firstdb); - hdr3->nvalid = cpu_to_be32(from->nvalid); - hdr3->nused = cpu_to_be32(from->nused); - } -} - static int xfs_dir3_free_get_buf( - struct xfs_trans *tp, - struct xfs_inode *dp, + xfs_da_args_t *args, xfs_dir2_db_t fbno, struct xfs_buf **bpp) { + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp; int error; struct xfs_dir3_icfree_hdr hdr; - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno), -1, &bp, XFS_DATA_FORK); if (error) return error; @@ -277,7 +231,7 @@ xfs_dir3_free_get_buf( uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); } else hdr.magic = XFS_DIR2_FREE_MAGIC; - xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr); + dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr); *bpp = bp; return 0; } @@ -287,7 +241,7 @@ xfs_dir3_free_get_buf( */ STATIC void xfs_dir2_free_log_bests( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, int first, /* first entry to log */ int last) /* last entry to log */ @@ -296,10 +250,10 @@ xfs_dir2_free_log_bests( __be16 *bests; free = bp->b_addr; - bests = xfs_dir3_free_bests_p(tp->t_mountp, free); + bests = args->dp->d_ops->free_bests_p(free); ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); - xfs_trans_log_buf(tp, bp, + xfs_trans_log_buf(args->trans, bp, (uint)((char *)&bests[first] - (char *)free), (uint)((char *)&bests[last] - (char *)free + sizeof(bests[0]) - 1)); @@ -310,7 +264,7 @@ xfs_dir2_free_log_bests( */ static void xfs_dir2_free_log_header( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp) { #ifdef DEBUG @@ -320,7 +274,8 @@ xfs_dir2_free_log_header( ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); #endif - xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1); + xfs_trans_log_buf(args->trans, bp, 0, + args->dp->d_ops->free_hdr_size - 1); } /* @@ -360,27 +315,27 @@ xfs_dir2_leaf_to_node( if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) { return error; } - ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp)); + ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); /* * Get the buffer for the new freespace block. */ - error = xfs_dir3_free_get_buf(tp, dp, fdb, &fbp); + error = xfs_dir3_free_get_buf(args, fdb, &fbp); if (error) return error; free = fbp->b_addr; - xfs_dir3_free_hdr_from_disk(&freehdr, free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); leaf = lbp->b_addr; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ASSERT(be32_to_cpu(ltp->bestcount) <= - (uint)dp->i_d.di_size / mp->m_dirblksize); + (uint)dp->i_d.di_size / args->geo->blksize); /* * Copy freespace entries from the leaf block to the new block. * Count active entries. */ from = xfs_dir2_leaf_bests_p(ltp); - to = xfs_dir3_free_bests_p(mp, free); + to = dp->d_ops->free_bests_p(free); for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { if ((off = be16_to_cpu(*from)) != NULLDATAOFF) n++; @@ -393,9 +348,9 @@ xfs_dir2_leaf_to_node( freehdr.nused = n; freehdr.nvalid = be32_to_cpu(ltp->bestcount); - xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr); - xfs_dir2_free_log_bests(tp, fbp, 0, freehdr.nvalid - 1); - xfs_dir2_free_log_header(tp, fbp); + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1); + xfs_dir2_free_log_header(args, fbp); /* * Converting the leaf to a leafnode is just a matter of changing the @@ -409,8 +364,8 @@ xfs_dir2_leaf_to_node( leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); lbp->b_ops = &xfs_dir3_leafn_buf_ops; xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF); - xfs_dir3_leaf_log_header(tp, lbp); - xfs_dir3_leaf_check(mp, lbp); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_check(dp, lbp); return 0; } @@ -443,8 +398,8 @@ xfs_dir2_leafn_add( mp = dp->i_mount; tp = args->trans; leaf = bp->b_addr; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); - ents = xfs_dir3_leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); /* * Quick check just to make sure we are not going to index @@ -460,7 +415,7 @@ xfs_dir2_leafn_add( * a compact. */ - if (leafhdr.count == xfs_dir3_max_leaf_ents(mp, leaf)) { + if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) { if (!leafhdr.stale) return XFS_ERROR(ENOSPC); compact = leafhdr.stale > 1; @@ -495,33 +450,34 @@ xfs_dir2_leafn_add( highstale, &lfloglow, &lfloghigh); lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo, args->blkno, args->index)); - xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, bp); - xfs_dir3_leaf_log_ents(tp, bp, lfloglow, lfloghigh); - xfs_dir3_leaf_check(mp, bp); + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, bp); + xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(dp, bp); return 0; } #ifdef DEBUG static void xfs_dir2_free_hdr_check( - struct xfs_mount *mp, + struct xfs_inode *dp, struct xfs_buf *bp, xfs_dir2_db_t db) { struct xfs_dir3_icfree_hdr hdr; - xfs_dir3_free_hdr_from_disk(&hdr, bp->b_addr); + dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr); - ASSERT((hdr.firstdb % xfs_dir3_free_max_bests(mp)) == 0); + ASSERT((hdr.firstdb % + dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0); ASSERT(hdr.firstdb <= db); ASSERT(db < hdr.firstdb + hdr.nvalid); } #else -#define xfs_dir2_free_hdr_check(mp, dp, db) +#define xfs_dir2_free_hdr_check(dp, bp, db) #endif /* DEBUG */ /* @@ -530,6 +486,7 @@ xfs_dir2_free_hdr_check( */ xfs_dahash_t /* hash value */ xfs_dir2_leafn_lasthash( + struct xfs_inode *dp, struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ { @@ -537,7 +494,7 @@ xfs_dir2_leafn_lasthash( struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); @@ -547,7 +504,7 @@ xfs_dir2_leafn_lasthash( if (!leafhdr.count) return 0; - ents = xfs_dir3_leaf_ents_p(leaf); + ents = dp->d_ops->leaf_ents_p(leaf); return be32_to_cpu(ents[leafhdr.count - 1].hashval); } @@ -584,10 +541,10 @@ xfs_dir2_leafn_lookup_for_addname( tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); - ents = xfs_dir3_leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); - xfs_dir3_leaf_check(mp, bp); + xfs_dir3_leaf_check(dp, bp); ASSERT(leafhdr.count > 0); /* @@ -605,7 +562,7 @@ xfs_dir2_leafn_lookup_for_addname( ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); } - length = xfs_dir3_data_entsize(mp, args->namelen); + length = dp->d_ops->data_entsize(args->namelen); /* * Loop over leaf entries with the right hash value. */ @@ -620,7 +577,8 @@ xfs_dir2_leafn_lookup_for_addname( /* * Pull the data block number from the entry. */ - newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); /* * For addname, we're looking for a place to put the new entry. * We want to use a data block with an entry of equal @@ -637,7 +595,7 @@ xfs_dir2_leafn_lookup_for_addname( * Convert the data block to the free block * holding its freespace information. */ - newfdb = xfs_dir2_db_to_fdb(mp, newdb); + newfdb = dp->d_ops->db_to_fdb(args->geo, newdb); /* * If it's not the one we have in hand, read it in. */ @@ -649,22 +607,23 @@ xfs_dir2_leafn_lookup_for_addname( xfs_trans_brelse(tp, curbp); error = xfs_dir2_free_read(tp, dp, - xfs_dir2_db_to_da(mp, newfdb), + xfs_dir2_db_to_da(args->geo, + newfdb), &curbp); if (error) return error; free = curbp->b_addr; - xfs_dir2_free_hdr_check(mp, curbp, curdb); + xfs_dir2_free_hdr_check(dp, curbp, curdb); } /* * Get the index for our entry. */ - fi = xfs_dir2_db_to_fdindex(mp, curdb); + fi = dp->d_ops->db_to_fdindex(args->geo, curdb); /* * If it has room, return it. */ - bests = xfs_dir3_free_bests_p(mp, free); + bests = dp->d_ops->free_bests_p(free); if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) { XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", XFS_ERRLEVEL_LOW, mp); @@ -734,10 +693,10 @@ xfs_dir2_leafn_lookup_for_entry( tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); - ents = xfs_dir3_leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); - xfs_dir3_leaf_check(mp, bp); + xfs_dir3_leaf_check(dp, bp); ASSERT(leafhdr.count > 0); /* @@ -765,7 +724,8 @@ xfs_dir2_leafn_lookup_for_entry( /* * Pull the data block number from the entry. */ - newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); /* * Not adding a new entry, so we really want to find * the name given to us. @@ -790,7 +750,8 @@ xfs_dir2_leafn_lookup_for_entry( curbp = state->extrablk.bp; } else { error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(mp, newdb), + xfs_dir2_db_to_da(args->geo, + newdb), -1, &curbp); if (error) return error; @@ -802,7 +763,8 @@ xfs_dir2_leafn_lookup_for_entry( * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr + - xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); /* * Compare the entry and if it's an exact match, return * EEXIST immediately. If it's the first case-insensitive @@ -816,7 +778,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_trans_brelse(tp, state->extrablk.bp); args->cmpresult = cmp; args->inumber = be64_to_cpu(dep->inumber); - args->filetype = xfs_dir3_dirent_get_ftype(mp, dep); + args->filetype = dp->d_ops->data_get_ftype(dep); *indexp = index; state->extravalid = 1; state->extrablk.bp = curbp; @@ -888,7 +850,6 @@ xfs_dir3_leafn_moveents( int start_d,/* destination leaf index */ int count) /* count of leaves to copy */ { - struct xfs_trans *tp = args->trans; int stale; /* count stale leaves copied */ trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); @@ -907,7 +868,7 @@ xfs_dir3_leafn_moveents( if (start_d < dhdr->count) { memmove(&dents[start_d + count], &dents[start_d], (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(tp, bp_d, start_d + count, + xfs_dir3_leaf_log_ents(args, bp_d, start_d + count, count + dhdr->count - 1); } /* @@ -929,7 +890,7 @@ xfs_dir3_leafn_moveents( */ memcpy(&dents[start_d], &sents[start_s], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1); /* * If there are source entries after the ones we copied, @@ -938,7 +899,7 @@ xfs_dir3_leafn_moveents( if (start_s + count < shdr->count) { memmove(&sents[start_s], &sents[start_s + count], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); + xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1); } /* @@ -956,6 +917,7 @@ xfs_dir3_leafn_moveents( */ int /* sort order */ xfs_dir2_leafn_order( + struct xfs_inode *dp, struct xfs_buf *leaf1_bp, /* leaf1 buffer */ struct xfs_buf *leaf2_bp) /* leaf2 buffer */ { @@ -966,10 +928,10 @@ xfs_dir2_leafn_order( struct xfs_dir3_icleaf_hdr hdr1; struct xfs_dir3_icleaf_hdr hdr2; - xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1); - xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2); - ents1 = xfs_dir3_leaf_ents_p(leaf1); - ents2 = xfs_dir3_leaf_ents_p(leaf2); + dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = dp->d_ops->leaf_ents_p(leaf1); + ents2 = dp->d_ops->leaf_ents_p(leaf2); if (hdr1.count > 0 && hdr2.count > 0 && (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) || @@ -1007,12 +969,13 @@ xfs_dir2_leafn_rebalance( struct xfs_dir2_leaf_entry *ents2; struct xfs_dir3_icleaf_hdr hdr1; struct xfs_dir3_icleaf_hdr hdr2; + struct xfs_inode *dp = state->args->dp; args = state->args; /* * If the block order is wrong, swap the arguments. */ - if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) { + if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) { xfs_da_state_blk_t *tmp; /* temp for block swap */ tmp = blk1; @@ -1021,10 +984,10 @@ xfs_dir2_leafn_rebalance( } leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1); - xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2); - ents1 = xfs_dir3_leaf_ents_p(leaf1); - ents2 = xfs_dir3_leaf_ents_p(leaf2); + dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = dp->d_ops->leaf_ents_p(leaf1); + ents2 = dp->d_ops->leaf_ents_p(leaf2); oldsum = hdr1.count + hdr2.count; #if defined(DEBUG) || defined(XFS_WARN) @@ -1070,13 +1033,13 @@ xfs_dir2_leafn_rebalance( ASSERT(hdr1.stale + hdr2.stale == oldstale); /* log the changes made when moving the entries */ - xfs_dir3_leaf_hdr_to_disk(leaf1, &hdr1); - xfs_dir3_leaf_hdr_to_disk(leaf2, &hdr2); - xfs_dir3_leaf_log_header(args->trans, blk1->bp); - xfs_dir3_leaf_log_header(args->trans, blk2->bp); + dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1); + dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2); + xfs_dir3_leaf_log_header(args, blk1->bp); + xfs_dir3_leaf_log_header(args, blk2->bp); - xfs_dir3_leaf_check(args->dp->i_mount, blk1->bp); - xfs_dir3_leaf_check(args->dp->i_mount, blk2->bp); + xfs_dir3_leaf_check(dp, blk1->bp); + xfs_dir3_leaf_check(dp, blk2->bp); /* * Mark whether we're inserting into the old or new leaf. @@ -1097,11 +1060,11 @@ xfs_dir2_leafn_rebalance( * Finally sanity check just to make sure we are not returning a * negative index */ - if(blk2->index < 0) { + if (blk2->index < 0) { state->inleaf = 1; blk2->index = 0; - xfs_alert(args->dp->i_mount, - "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n", + xfs_alert(dp->i_mount, + "%s: picked the wrong leaf? reverting original leaf: blk1->index %d", __func__, blk1->index); } } @@ -1116,21 +1079,20 @@ xfs_dir3_data_block_free( struct xfs_buf *fbp, int longest) { - struct xfs_trans *tp = args->trans; int logfree = 0; __be16 *bests; struct xfs_dir3_icfree_hdr freehdr; + struct xfs_inode *dp = args->dp; - xfs_dir3_free_hdr_from_disk(&freehdr, free); - - bests = xfs_dir3_free_bests_p(tp->t_mountp, free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + bests = dp->d_ops->free_bests_p(free); if (hdr) { /* * Data block is not empty, just set the free entry to the new * value. */ bests[findex] = cpu_to_be16(longest); - xfs_dir2_free_log_bests(tp, fbp, findex, findex); + xfs_dir2_free_log_bests(args, fbp, findex, findex); return 0; } @@ -1157,8 +1119,8 @@ xfs_dir3_data_block_free( logfree = 1; } - xfs_dir3_free_hdr_to_disk(free, &freehdr); - xfs_dir2_free_log_header(tp, fbp); + dp->d_ops->free_hdr_to_disk(free, &freehdr); + xfs_dir2_free_log_header(args, fbp); /* * If there are no useful entries left in the block, get rid of the @@ -1182,7 +1144,7 @@ xfs_dir3_data_block_free( /* Log the free entry that changed, unless we got rid of it. */ if (logfree) - xfs_dir2_free_log_bests(tp, fbp, findex, findex); + xfs_dir2_free_log_bests(args, fbp, findex, findex); return 0; } @@ -1222,8 +1184,8 @@ xfs_dir2_leafn_remove( tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); - ents = xfs_dir3_leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); /* * Point to the entry we're removing. @@ -1233,9 +1195,9 @@ xfs_dir2_leafn_remove( /* * Extract the data block and offset from the entry. */ - db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); ASSERT(dblk->blkno == db); - off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)); + off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)); ASSERT(dblk->index == off); /* @@ -1243,11 +1205,11 @@ xfs_dir2_leafn_remove( * Log the leaf block changes. */ leafhdr.stale++; - xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, bp); + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, bp); lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir3_leaf_log_ents(tp, bp, index, index); + xfs_dir3_leaf_log_ents(args, bp, index, index); /* * Make the data entry free. Keep track of the longest freespace @@ -1256,19 +1218,19 @@ xfs_dir2_leafn_remove( dbp = dblk->bp; hdr = dbp->b_addr; dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); - bf = xfs_dir3_data_bestfree_p(hdr); + bf = dp->d_ops->data_bestfree_p(hdr); longest = be16_to_cpu(bf[0].length); needlog = needscan = 0; - xfs_dir2_data_make_free(tp, dbp, off, - xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan); + xfs_dir2_data_make_free(args, dbp, off, + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* * Rescan the data block freespaces for bestfree. * Log the data block header if needed. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_log_header(args, dbp); xfs_dir3_data_check(dp, dbp); /* * If the longest data block freespace changes, need to update @@ -1285,8 +1247,9 @@ xfs_dir2_leafn_remove( * Convert the data block number to a free block, * read in the free block. */ - fdb = xfs_dir2_db_to_fdb(mp, db); - error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb), + fdb = dp->d_ops->db_to_fdb(args->geo, db); + error = xfs_dir2_free_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fdb), &fbp); if (error) return error; @@ -1294,22 +1257,23 @@ xfs_dir2_leafn_remove( #ifdef DEBUG { struct xfs_dir3_icfree_hdr freehdr; - xfs_dir3_free_hdr_from_disk(&freehdr, free); - ASSERT(freehdr.firstdb == xfs_dir3_free_max_bests(mp) * - (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) * + (fdb - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET))); } #endif /* * Calculate which entry we need to fix. */ - findex = xfs_dir2_db_to_fdindex(mp, db); + findex = dp->d_ops->db_to_fdindex(args->geo, db); longest = be16_to_cpu(bf[0].length); /* * If the data block is now empty we can get rid of it * (usually). */ - if (longest == mp->m_dirblksize - - xfs_dir3_data_entry_offset(hdr)) { + if (longest == args->geo->blksize - + dp->d_ops->data_entry_offset) { /* * Try to punch out the data block. */ @@ -1336,14 +1300,14 @@ xfs_dir2_leafn_remove( return error; } - xfs_dir3_leaf_check(mp, bp); + xfs_dir3_leaf_check(dp, bp); /* * Return indication of whether this leaf block is empty enough * to justify trying to join it with a neighbor. */ - *rval = (xfs_dir3_leaf_hdr_size(leaf) + + *rval = (dp->d_ops->leaf_hdr_size + (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) < - mp->m_dir_magicpct; + args->geo->magicpct; return 0; } @@ -1360,13 +1324,14 @@ xfs_dir2_leafn_split( xfs_dablk_t blkno; /* new leaf block number */ int error; /* error return value */ xfs_mount_t *mp; /* filesystem mount point */ + struct xfs_inode *dp; /* * Allocate space for a new leaf node. */ args = state->args; - mp = args->dp->i_mount; - ASSERT(args != NULL); + dp = args->dp; + mp = dp->i_mount; ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); error = xfs_da_grow_inode(args, &blkno); if (error) { @@ -1375,7 +1340,7 @@ xfs_dir2_leafn_split( /* * Initialize the new leaf block. */ - error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(mp, blkno), + error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno), &newblk->bp, XFS_DIR2_LEAFN_MAGIC); if (error) return error; @@ -1401,10 +1366,10 @@ xfs_dir2_leafn_split( /* * Update last hashval in each block since we added the name. */ - oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL); - newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL); - xfs_dir3_leaf_check(mp, oldblk->bp); - xfs_dir3_leaf_check(mp, newblk->bp); + oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL); + newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL); + xfs_dir3_leaf_check(dp, oldblk->bp); + xfs_dir3_leaf_check(dp, newblk->bp); return error; } @@ -1434,6 +1399,7 @@ xfs_dir2_leafn_toosmall( int rval; /* result from path_shift */ struct xfs_dir3_icleaf_hdr leafhdr; struct xfs_dir2_leaf_entry *ents; + struct xfs_inode *dp = state->args->dp; /* * Check for the degenerate case of the block being over 50% full. @@ -1442,13 +1408,13 @@ xfs_dir2_leafn_toosmall( */ blk = &state->path.blk[state->path.active - 1]; leaf = blk->bp->b_addr; - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); - ents = xfs_dir3_leaf_ents_p(leaf); - xfs_dir3_leaf_check(state->args->dp->i_mount, blk->bp); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir3_leaf_check(dp, blk->bp); count = leafhdr.count - leafhdr.stale; - bytes = xfs_dir3_leaf_hdr_size(leaf) + count * sizeof(ents[0]); - if (bytes > (state->blocksize >> 1)) { + bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]); + if (bytes > (state->args->geo->blksize >> 1)) { /* * Blk over 50%, don't try to join. */ @@ -1492,7 +1458,7 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - error = xfs_dir3_leafn_read(state->args->trans, state->args->dp, + error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, -1, &bp); if (error) return error; @@ -1501,11 +1467,12 @@ xfs_dir2_leafn_toosmall( * Count bytes in the two blocks combined. */ count = leafhdr.count - leafhdr.stale; - bytes = state->blocksize - (state->blocksize >> 2); + bytes = state->args->geo->blksize - + (state->args->geo->blksize >> 2); leaf = bp->b_addr; - xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf); - ents = xfs_dir3_leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); count += hdr2.count - hdr2.stale; bytes -= count * sizeof(ents[0]); @@ -1559,6 +1526,7 @@ xfs_dir2_leafn_unbalance( struct xfs_dir3_icleaf_hdr drophdr; struct xfs_dir2_leaf_entry *sents; struct xfs_dir2_leaf_entry *dents; + struct xfs_inode *dp = state->args->dp; args = state->args; ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); @@ -1566,10 +1534,10 @@ xfs_dir2_leafn_unbalance( drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - xfs_dir3_leaf_hdr_from_disk(&savehdr, save_leaf); - xfs_dir3_leaf_hdr_from_disk(&drophdr, drop_leaf); - sents = xfs_dir3_leaf_ents_p(save_leaf); - dents = xfs_dir3_leaf_ents_p(drop_leaf); + dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf); + dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf); + sents = dp->d_ops->leaf_ents_p(save_leaf); + dents = dp->d_ops->leaf_ents_p(drop_leaf); /* * If there are any stale leaf entries, take this opportunity @@ -1584,7 +1552,7 @@ xfs_dir2_leafn_unbalance( * Move the entries from drop to the appropriate end of save. */ drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval); - if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp)) + if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp)) xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, save_blk->bp, &savehdr, sents, 0, drophdr.count); @@ -1595,13 +1563,13 @@ xfs_dir2_leafn_unbalance( save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval); /* log the changes made when moving the entries */ - xfs_dir3_leaf_hdr_to_disk(save_leaf, &savehdr); - xfs_dir3_leaf_hdr_to_disk(drop_leaf, &drophdr); - xfs_dir3_leaf_log_header(args->trans, save_blk->bp); - xfs_dir3_leaf_log_header(args->trans, drop_blk->bp); + dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr); + dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr); + xfs_dir3_leaf_log_header(args, save_blk->bp); + xfs_dir3_leaf_log_header(args, drop_blk->bp); - xfs_dir3_leaf_check(args->dp->i_mount, save_blk->bp); - xfs_dir3_leaf_check(args->dp->i_mount, drop_blk->bp); + xfs_dir3_leaf_check(dp, save_blk->bp); + xfs_dir3_leaf_check(dp, drop_blk->bp); } /* @@ -1624,8 +1592,6 @@ xfs_dir2_node_addname( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; /* * Look up the name. We're not supposed to find it, but * this gives us the insertion point. @@ -1712,7 +1678,7 @@ xfs_dir2_node_addname_int( dp = args->dp; mp = dp->i_mount; tp = args->trans; - length = xfs_dir3_data_entsize(mp, args->namelen); + length = dp->d_ops->data_entsize(args->namelen); /* * If we came in with a freespace block that means that lookup * found an entry with our hash value. This is the freespace @@ -1726,8 +1692,8 @@ xfs_dir2_node_addname_int( ifbno = fblk->blkno; free = fbp->b_addr; findex = fblk->index; - bests = xfs_dir3_free_bests_p(mp, free); - xfs_dir3_free_hdr_from_disk(&freehdr, free); + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); /* * This means the free entry showed that the data block had @@ -1764,9 +1730,9 @@ xfs_dir2_node_addname_int( if (dbno == -1) { xfs_fileoff_t fo; /* freespace block number */ - if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) return error; - lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo); + lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); fbno = ifbno; } /* @@ -1784,7 +1750,8 @@ xfs_dir2_node_addname_int( * us a freespace block to start with. */ if (++fbno == 0) - fbno = XFS_DIR2_FREE_FIRSTDB(mp); + fbno = xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET); /* * If it's ifbno we already looked at it. */ @@ -1802,8 +1769,8 @@ xfs_dir2_node_addname_int( * to avoid it. */ error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - &fbp); + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); if (error) return error; if (!fbp) @@ -1819,8 +1786,8 @@ xfs_dir2_node_addname_int( * and the freehdr are actually initialised if they are placed * there, so we have to do it here to avoid warnings. Blech. */ - bests = xfs_dir3_free_bests_p(mp, free); - xfs_dir3_free_hdr_from_disk(&freehdr, free); + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); if (be16_to_cpu(bests[findex]) != NULLDATAOFF && be16_to_cpu(bests[findex]) >= length) dbno = freehdr.firstdb + findex; @@ -1871,10 +1838,10 @@ xfs_dir2_node_addname_int( * Get the freespace block corresponding to the data block * that was just allocated. */ - fbno = xfs_dir2_db_to_fdb(mp, dbno); + fbno = dp->d_ops->db_to_fdb(args->geo, dbno); error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - &fbp); + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); if (error) return error; @@ -1888,12 +1855,13 @@ xfs_dir2_node_addname_int( if (error) return error; - if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { + if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { xfs_alert(mp, "%s: dir ino %llu needed freesp block %lld for\n" " data block %lld, got %lld ifbno %llu lastfbno %d", __func__, (unsigned long long)dp->i_ino, - (long long)xfs_dir2_db_to_fdb(mp, dbno), + (long long)dp->d_ops->db_to_fdb( + args->geo, dbno), (long long)dbno, (long long)fbno, (unsigned long long)ifbno, lastfbno); if (fblk) { @@ -1914,34 +1882,36 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - error = xfs_dir3_free_get_buf(tp, dp, fbno, &fbp); + error = xfs_dir3_free_get_buf(args, fbno, &fbp); if (error) return error; free = fbp->b_addr; - bests = xfs_dir3_free_bests_p(mp, free); - xfs_dir3_free_hdr_from_disk(&freehdr, free); + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); /* * Remember the first slot as our empty slot. */ - freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * - xfs_dir3_free_max_bests(mp); + freehdr.firstdb = + (fbno - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET)) * + dp->d_ops->free_max_bests(args->geo); } else { free = fbp->b_addr; - bests = xfs_dir3_free_bests_p(mp, free); - xfs_dir3_free_hdr_from_disk(&freehdr, free); + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); } /* * Set the freespace block index from the data block number. */ - findex = xfs_dir2_db_to_fdindex(mp, dbno); + findex = dp->d_ops->db_to_fdindex(args->geo, dbno); /* * If it's after the end of the current entries in the * freespace block, extend that table. */ if (findex >= freehdr.nvalid) { - ASSERT(findex < xfs_dir3_free_max_bests(mp)); + ASSERT(findex < dp->d_ops->free_max_bests(args->geo)); freehdr.nvalid = findex + 1; /* * Tag new entry so nused will go up. @@ -1954,8 +1924,8 @@ xfs_dir2_node_addname_int( */ if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { freehdr.nused++; - xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr); - xfs_dir2_free_log_header(tp, fbp); + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_header(args, fbp); } /* * Update the real value in the table. @@ -1963,7 +1933,7 @@ xfs_dir2_node_addname_int( * change again. */ hdr = dbp->b_addr; - bf = xfs_dir3_data_bestfree_p(hdr); + bf = dp->d_ops->data_bestfree_p(hdr); bests[findex] = bf[0].length; logfree = 1; } @@ -1980,12 +1950,13 @@ xfs_dir2_node_addname_int( /* * Read the data block in. */ - error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, dbno), -1, &dbp); if (error) return error; hdr = dbp->b_addr; - bf = xfs_dir3_data_bestfree_p(hdr); + bf = dp->d_ops->data_bestfree_p(hdr); logfree = 0; } ASSERT(be16_to_cpu(bf[0].length) >= length); @@ -1998,7 +1969,7 @@ xfs_dir2_node_addname_int( /* * Mark the first part of the unused space, inuse for us. */ - xfs_dir2_data_use_free(tp, dbp, dup, + xfs_dir2_data_use_free(args, dbp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, &needlog, &needscan); /* @@ -2008,24 +1979,24 @@ xfs_dir2_node_addname_int( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - xfs_dir3_dirent_put_ftype(mp, dep, args->filetype); - tagp = xfs_dir3_data_entry_tag_p(mp, dep); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, dbp, dep); + xfs_dir2_data_log_entry(args, dbp, dep); /* * Rescan the block for bestfree if needed. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); /* * Log the data block header if needed. */ if (needlog) - xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * If the freespace entry is now wrong, update it. */ - bests = xfs_dir3_free_bests_p(mp, free); /* gcc is so stupid */ + bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { bests[findex] = bf[0].length; logfree = 1; @@ -2034,7 +2005,7 @@ xfs_dir2_node_addname_int( * Log the freespace entry if needed. */ if (logfree) - xfs_dir2_free_log_bests(tp, fbp, findex, findex); + xfs_dir2_free_log_bests(args, fbp, findex, findex); /* * Return the data block and offset in args, then drop the data block. */ @@ -2065,8 +2036,6 @@ xfs_dir2_node_lookup( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; /* * Fill in the path to the entry in the cursor. */ @@ -2105,12 +2074,12 @@ xfs_dir2_node_lookup( */ int /* error */ xfs_dir2_node_removename( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { - xfs_da_state_blk_t *blk; /* leaf block */ + struct xfs_da_state_blk *blk; /* leaf block */ int error; /* error return value */ int rval; /* operation return value */ - xfs_da_state_t *state; /* btree cursor */ + struct xfs_da_state *state; /* btree cursor */ trace_xfs_dir2_node_removename(args); @@ -2120,21 +2089,18 @@ xfs_dir2_node_removename( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; - /* - * Look up the entry we're deleting, set up the cursor. - */ + + /* Look up the entry we're deleting, set up the cursor. */ error = xfs_da3_node_lookup_int(state, &rval); if (error) - rval = error; - /* - * Didn't find it, upper layer screwed up. - */ + goto out_free; + + /* Didn't find it, upper layer screwed up. */ if (rval != EEXIST) { - xfs_da_state_free(state); - return rval; + error = rval; + goto out_free; } + blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(state->extravalid); @@ -2145,7 +2111,7 @@ xfs_dir2_node_removename( error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, &state->extrablk, &rval); if (error) - return error; + goto out_free; /* * Fix the hash values up the btree. */ @@ -2160,6 +2126,7 @@ xfs_dir2_node_removename( */ if (!error) error = xfs_dir2_node_to_leaf(state); +out_free: xfs_da_state_free(state); return error; } @@ -2190,8 +2157,6 @@ xfs_dir2_node_replace( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; inum = args->inumber; /* * Lookup the entry to change in the btree. @@ -2212,7 +2177,7 @@ xfs_dir2_node_replace( blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); leaf = blk->bp->b_addr; - ents = xfs_dir3_leaf_ents_p(leaf); + ents = args->dp->d_ops->leaf_ents_p(leaf); lep = &ents[blk->index]; ASSERT(state->extravalid); /* @@ -2223,14 +2188,15 @@ xfs_dir2_node_replace( hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + - xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); ASSERT(inum != be64_to_cpu(dep->inumber)); /* * Fill in the new inode number and log the entry. */ dep->inumber = cpu_to_be64(inum); - xfs_dir3_dirent_put_ftype(state->mp, dep, args->filetype); - xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep); + args->dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_log_entry(args, state->extrablk.bp, dep); rval = 0; } /* @@ -2285,7 +2251,7 @@ xfs_dir2_node_trim_free( if (!bp) return 0; free = bp->b_addr; - xfs_dir3_free_hdr_from_disk(&freehdr, free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); /* * If there are used entries, there's nothing to do. @@ -2298,9 +2264,9 @@ xfs_dir2_node_trim_free( /* * Blow the block away. */ - if ((error = - xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo), - bp))) { + error = xfs_dir2_shrink_inode(args, + xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp); + if (error) { /* * Can't fail with ENOSPC since that only happens with no * space reservation, when breaking up an extent into two diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 1bad84c4082..27ce0794d19 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -20,6 +20,140 @@ struct dir_context; +/* + * Directory offset/block conversion functions. + * + * DB blocks here are logical directory block numbers, not filesystem blocks. + */ + +/* + * Convert dataptr to byte in file space + */ +static inline xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp) +{ + return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; +} + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by) +{ + return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); +} + +/* + * Convert byte in space to (DB) block + */ +static inline xfs_dir2_db_t +xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_db_t)(by >> geo->blklog); +} + +/* + * Convert dataptr to a block number + */ +static inline xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert byte in space to offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1)); +} + +/* + * Convert dataptr to a byte offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert block and offset to byte in space + */ +static inline xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return ((xfs_dir2_off_t)db << geo->blklog) + o; +} + +/* + * Convert block (DB) to block (dablk) + */ +static inline xfs_dablk_t +xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog)); +} + +/* + * Convert byte in space to (DA) block + */ +static inline xfs_dablk_t +xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by)); +} + +/* + * Convert block and offset to dataptr + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o)); +} + +/* + * Convert block (dablk) to block (DB) + */ +static inline xfs_dir2_db_t +xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog)); +} + +/* + * Convert block (dablk) to byte offset in space + */ +static inline xfs_dir2_off_t +xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0); +} + +/* + * Directory tail pointer accessor functions. Based on block geometry. + */ +static inline struct xfs_dir2_block_tail * +xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir2_block_tail *) + ((char *)hdr + geo->blksize)) - 1; +} + +static inline struct xfs_dir2_leaf_tail * +xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) +{ + return (struct xfs_dir2_leaf_tail *) + ((char *)lp + geo->blksize - + sizeof(struct xfs_dir2_leaf_tail)); +} + /* xfs_dir2.c */ extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, @@ -54,12 +188,13 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); -extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mapped_bno); +extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno); extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, - struct xfs_dir2_data_unused *dup, int *loghead); + struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup, + int *loghead); extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, struct xfs_buf **bpp); @@ -76,9 +211,9 @@ extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, struct xfs_buf **bpp, __uint16_t magic); -extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, - int first, int last); -extern void xfs_dir3_leaf_log_header(struct xfs_trans *tp, +extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args, + struct xfs_buf *bp, int first, int last); +extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args, struct xfs_buf *bp); extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); @@ -93,21 +228,18 @@ xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, int lowstale, int highstale, int *lfloglow, int *lfloghigh); extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); -extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to, - struct xfs_dir2_leaf *from); -extern void xfs_dir3_leaf_hdr_to_disk(struct xfs_dir2_leaf *to, - struct xfs_dir3_icleaf_hdr *from); -extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, +extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp, struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_buf *lbp); -extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_buf *bp, int *count); +extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp, + struct xfs_buf *bp, int *count); extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp, struct xfs_da_args *args, int *indexp, struct xfs_da_state *state); -extern int xfs_dir2_leafn_order(struct xfs_buf *leaf1_bp, +extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); extern int xfs_dir2_leafn_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 8f84153e98a..48e99afb9cb 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -18,23 +18,23 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_dinode.h" /* * Directory file type support functions @@ -76,26 +76,25 @@ const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { STATIC int xfs_dir2_sf_getdents( - xfs_inode_t *dp, /* incore directory inode */ + struct xfs_da_args *args, struct dir_context *ctx) { int i; /* shortform entry number */ - xfs_mount_t *mp; /* filesystem mount point */ + struct xfs_inode *dp = args->dp; /* incore directory inode */ xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_dataptr_t dot_offset; xfs_dir2_dataptr_t dotdot_offset; xfs_ino_t ino; - - mp = dp->i_mount; + struct xfs_da_geometry *geo = args->geo; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); /* * Give up if the directory is way too short. */ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); return XFS_ERROR(EIO); } @@ -109,19 +108,19 @@ xfs_dir2_sf_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; /* * Precalculate offsets for . and .. as we will always need them. * * XXX(hch): the second argument is sometimes 0 and sometimes - * mp->m_dirdatablk. + * geo->datablk */ - dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - xfs_dir3_data_dot_offset(mp)); - dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - xfs_dir3_data_dotdot_offset(mp)); + dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + dp->d_ops->data_dot_offset); + dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + dp->d_ops->data_dotdot_offset); /* * Put . entry unless we're starting past it. @@ -136,7 +135,7 @@ xfs_dir2_sf_getdents( * Put .. entry unless we're starting past it. */ if (ctx->pos <= dotdot_offset) { - ino = xfs_dir2_sf_get_parent_ino(sfp); + ino = dp->d_ops->sf_get_parent_ino(sfp); ctx->pos = dotdot_offset & 0x7fffffff; if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) return 0; @@ -149,25 +148,25 @@ xfs_dir2_sf_getdents( for (i = 0; i < sfp->count; i++) { __uint8_t filetype; - off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, xfs_dir2_sf_get_offset(sfep)); if (ctx->pos > off) { - sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); continue; } - ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep); - filetype = xfs_dir3_sfe_get_ftype(mp, sfp, sfep); + ino = dp->d_ops->sf_get_ino(sfp, sfep); + filetype = dp->d_ops->sf_get_ftype(sfep); ctx->pos = off & 0x7fffffff; if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, - xfs_dir3_get_dtype(mp, filetype))) + xfs_dir3_get_dtype(dp->i_mount, filetype))) return 0; - sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); } - ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & - 0x7fffffff; + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; return 0; } @@ -176,9 +175,10 @@ xfs_dir2_sf_getdents( */ STATIC int xfs_dir2_block_getdents( - xfs_inode_t *dp, /* incore inode */ + struct xfs_da_args *args, struct dir_context *ctx) { + struct xfs_inode *dp = args->dp; /* incore directory inode */ xfs_dir2_data_hdr_t *hdr; /* block header */ struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ @@ -186,16 +186,15 @@ xfs_dir2_block_getdents( xfs_dir2_data_unused_t *dup; /* block unused entry */ char *endptr; /* end of the data entries */ int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ char *ptr; /* current data entry */ int wantoff; /* starting block offset */ xfs_off_t cook; + struct xfs_da_geometry *geo = args->geo; - mp = dp->i_mount; /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; error = xfs_dir3_block_read(NULL, dp, &bp); @@ -206,14 +205,14 @@ xfs_dir2_block_getdents( * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. */ - wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos); + wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos); hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); /* * Set up values for the loop. */ - btp = xfs_dir2_block_tail_p(mp, hdr); - ptr = (char *)xfs_dir3_data_entry_p(hdr); + btp = xfs_dir2_block_tail_p(geo, hdr); + ptr = (char *)dp->d_ops->data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); /* @@ -237,24 +236,24 @@ xfs_dir2_block_getdents( /* * Bump pointer for the next iteration. */ - ptr += xfs_dir3_data_entsize(mp, dep->namelen); + ptr += dp->d_ops->data_entsize(dep->namelen); /* * The entry is before the desired starting point, skip it. */ if ((char *)dep - (char *)hdr < wantoff) continue; - cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, (char *)dep - (char *)hdr); ctx->pos = cook & 0x7fffffff; - filetype = xfs_dir3_dirent_get_ftype(mp, dep); + filetype = dp->d_ops->data_get_ftype(dep); /* * If it didn't fit, set the final offset to here & return. */ if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), - xfs_dir3_get_dtype(mp, filetype))) { + xfs_dir3_get_dtype(dp->i_mount, filetype))) { xfs_trans_brelse(NULL, bp); return 0; } @@ -264,8 +263,8 @@ xfs_dir2_block_getdents( * Reached the end of the block. * Set the offset to a non-existent block 1 and return. */ - ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & - 0x7fffffff; + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; xfs_trans_brelse(NULL, bp); return 0; } @@ -286,13 +285,13 @@ struct xfs_dir2_leaf_map_info { STATIC int xfs_dir2_leaf_readbuf( - struct xfs_inode *dp, + struct xfs_da_args *args, size_t bufsize, struct xfs_dir2_leaf_map_info *mip, xfs_dir2_off_t *curoff, struct xfs_buf **bpp) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *dp = args->dp; struct xfs_buf *bp = *bpp; struct xfs_bmbt_irec *map = mip->map; struct blk_plug plug; @@ -300,6 +299,7 @@ xfs_dir2_leaf_readbuf( int length; int i; int j; + struct xfs_da_geometry *geo = args->geo; /* * If we have a buffer, we need to release it and @@ -309,12 +309,12 @@ xfs_dir2_leaf_readbuf( if (bp) { xfs_trans_brelse(NULL, bp); bp = NULL; - mip->map_blocks -= mp->m_dirblkfsbs; + mip->map_blocks -= geo->fsbcount; /* * Loop to get rid of the extents for the * directory block. */ - for (i = mp->m_dirblkfsbs; i > 0; ) { + for (i = geo->fsbcount; i > 0; ) { j = min_t(int, map->br_blockcount, i); map->br_blockcount -= j; map->br_startblock += j; @@ -333,8 +333,7 @@ xfs_dir2_leaf_readbuf( /* * Recalculate the readahead blocks wanted. */ - mip->ra_want = howmany(bufsize + mp->m_dirblksize, - mp->m_sb.sb_blocksize) - 1; + mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1; ASSERT(mip->ra_want >= 0); /* @@ -342,14 +341,14 @@ xfs_dir2_leaf_readbuf( * run out of data blocks, get some more mappings. */ if (1 + mip->ra_want > mip->map_blocks && - mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { + mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) { /* * Get more bmaps, fill in after the ones * we already have in the table. */ mip->nmap = mip->map_size - mip->map_valid; error = xfs_bmapi_read(dp, mip->map_off, - xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - + xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) - mip->map_off, &map[mip->map_valid], &mip->nmap, 0); @@ -370,7 +369,7 @@ xfs_dir2_leaf_readbuf( i = mip->map_valid + mip->nmap - 1; mip->map_off = map[i].br_startoff + map[i].br_blockcount; } else - mip->map_off = xfs_dir2_byte_to_da(mp, + mip->map_off = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); /* @@ -396,18 +395,18 @@ xfs_dir2_leaf_readbuf( * No valid mappings, so no more data blocks. */ if (!mip->map_valid) { - *curoff = xfs_dir2_da_to_byte(mp, mip->map_off); + *curoff = xfs_dir2_da_to_byte(geo, mip->map_off); goto out; } /* * Read the directory block starting at the first mapping. */ - mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); + mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff); error = xfs_dir3_data_read(NULL, dp, map->br_startoff, - map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); - + map->br_blockcount >= geo->fsbcount ? + XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) : + -1, &bp); /* * Should just skip over the data block instead of giving up. */ @@ -419,7 +418,7 @@ xfs_dir2_leaf_readbuf( * was previously ra. */ if (mip->ra_current) - mip->ra_current -= mp->m_dirblkfsbs; + mip->ra_current -= geo->fsbcount; /* * Do we need more readahead? @@ -427,16 +426,16 @@ xfs_dir2_leaf_readbuf( blk_start_plug(&plug); for (mip->ra_index = mip->ra_offset = i = 0; mip->ra_want > mip->ra_current && i < mip->map_blocks; - i += mp->m_dirblkfsbs) { + i += geo->fsbcount) { ASSERT(mip->ra_index < mip->map_valid); /* * Read-ahead a contiguous directory block. */ if (i > mip->ra_current && - map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_dir3_data_readahead(NULL, dp, + map[mip->ra_index].br_blockcount >= geo->fsbcount) { + xfs_dir3_data_readahead(dp, map[mip->ra_index].br_startoff + mip->ra_offset, - XFS_FSB_TO_DADDR(mp, + XFS_FSB_TO_DADDR(dp->i_mount, map[mip->ra_index].br_startblock + mip->ra_offset)); mip->ra_current = i; @@ -447,7 +446,7 @@ xfs_dir2_leaf_readbuf( * use our mapping, but this is a very rare case. */ else if (i > mip->ra_current) { - xfs_dir3_data_readahead(NULL, dp, + xfs_dir3_data_readahead(dp, map[mip->ra_index].br_startoff + mip->ra_offset, -1); mip->ra_current = i; @@ -456,15 +455,14 @@ xfs_dir2_leaf_readbuf( /* * Advance offset through the mapping table. */ - for (j = 0; j < mp->m_dirblkfsbs; j++) { + for (j = 0; j < geo->fsbcount; j += length ) { /* * The rest of this extent but not more than a dir * block. */ - length = min_t(int, mp->m_dirblkfsbs, + length = min_t(int, geo->fsbcount, map[mip->ra_index].br_blockcount - mip->ra_offset); - j += length; mip->ra_offset += length; /* @@ -489,22 +487,23 @@ out: */ STATIC int xfs_dir2_leaf_getdents( - xfs_inode_t *dp, /* incore directory inode */ + struct xfs_da_args *args, struct dir_context *ctx, size_t bufsize) { + struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; /* data block buffer */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ int error = 0; /* error return value */ int length; /* temporary length value */ - xfs_mount_t *mp; /* filesystem mount point */ int byteoff; /* offset in current block */ xfs_dir2_off_t curoff; /* current overall offset */ xfs_dir2_off_t newoff; /* new curoff after new blk */ char *ptr = NULL; /* pointer to current data */ struct xfs_dir2_leaf_map_info *map_info; + struct xfs_da_geometry *geo = args->geo; /* * If the offset is at or past the largest allowed value, @@ -513,15 +512,12 @@ xfs_dir2_leaf_getdents( if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) return 0; - mp = dp->i_mount; - /* * Set up to bmap a number of blocks based on the caller's * buffer size, the directory block size, and the filesystem * block size. */ - length = howmany(bufsize + mp->m_dirblksize, - mp->m_sb.sb_blocksize); + length = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + (length * sizeof(struct xfs_bmbt_irec)), KM_SLEEP | KM_NOFS); @@ -531,14 +527,14 @@ xfs_dir2_leaf_getdents( * Inside the loop we keep the main offset value as a byte offset * in the directory file. */ - curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos); + curoff = xfs_dir2_dataptr_to_byte(ctx->pos); /* * Force this conversion through db so we truncate the offset * down to get the start of the data block. */ - map_info->map_off = xfs_dir2_db_to_da(mp, - xfs_dir2_byte_to_db(mp, curoff)); + map_info->map_off = xfs_dir2_db_to_da(geo, + xfs_dir2_byte_to_db(geo, curoff)); /* * Loop over directory entries until we reach the end offset. @@ -551,9 +547,9 @@ xfs_dir2_leaf_getdents( * If we have no buffer, or we're off the end of the * current buffer, need to get another one. */ - if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) { + if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { - error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info, + error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, &curoff, &bp); if (error || !map_info->map_valid) break; @@ -561,7 +557,8 @@ xfs_dir2_leaf_getdents( /* * Having done a read, we need to set a new offset. */ - newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0); + newoff = xfs_dir2_db_off_to_byte(geo, + map_info->curdb, 0); /* * Start of the current block. */ @@ -571,20 +568,20 @@ xfs_dir2_leaf_getdents( * Make sure we're in the right block. */ else if (curoff > newoff) - ASSERT(xfs_dir2_byte_to_db(mp, curoff) == + ASSERT(xfs_dir2_byte_to_db(geo, curoff) == map_info->curdb); hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); /* * Find our position in the block. */ - ptr = (char *)xfs_dir3_data_entry_p(hdr); - byteoff = xfs_dir2_byte_to_off(mp, curoff); + ptr = (char *)dp->d_ops->data_entry_p(hdr); + byteoff = xfs_dir2_byte_to_off(geo, curoff); /* * Skip past the header. */ if (byteoff == 0) - curoff += xfs_dir3_data_entry_offset(hdr); + curoff += dp->d_ops->data_entry_offset; /* * Skip past entries until we reach our offset. */ @@ -601,17 +598,17 @@ xfs_dir2_leaf_getdents( } dep = (xfs_dir2_data_entry_t *)ptr; length = - xfs_dir3_data_entsize(mp, dep->namelen); + dp->d_ops->data_entsize(dep->namelen); ptr += length; } /* * Now set our real offset. */ curoff = - xfs_dir2_db_off_to_byte(mp, - xfs_dir2_byte_to_db(mp, curoff), + xfs_dir2_db_off_to_byte(geo, + xfs_dir2_byte_to_db(geo, curoff), (char *)ptr - (char *)hdr); - if (ptr >= (char *)hdr + mp->m_dirblksize) { + if (ptr >= (char *)hdr + geo->blksize) { continue; } } @@ -632,13 +629,13 @@ xfs_dir2_leaf_getdents( } dep = (xfs_dir2_data_entry_t *)ptr; - length = xfs_dir3_data_entsize(mp, dep->namelen); - filetype = xfs_dir3_dirent_get_ftype(mp, dep); + length = dp->d_ops->data_entsize(dep->namelen); + filetype = dp->d_ops->data_get_ftype(dep); - ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), - xfs_dir3_get_dtype(mp, filetype))) + xfs_dir3_get_dtype(dp->i_mount, filetype))) break; /* @@ -653,10 +650,10 @@ xfs_dir2_leaf_getdents( /* * All done. Set output offset value to current offset. */ - if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) + if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR)) ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else - ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; kmem_free(map_info); if (bp) xfs_trans_brelse(NULL, bp); @@ -668,12 +665,14 @@ xfs_dir2_leaf_getdents( */ int xfs_readdir( - xfs_inode_t *dp, - struct dir_context *ctx, - size_t bufsize) + struct xfs_inode *dp, + struct dir_context *ctx, + size_t bufsize) { - int rval; /* return value */ - int v; /* type-checking value */ + struct xfs_da_args args = { NULL }; + int rval; + int v; + uint lock_mode; trace_xfs_readdir(dp); @@ -683,13 +682,19 @@ xfs_readdir( ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_getdents); + args.dp = dp; + args.geo = dp->i_mount->m_dir_geo; + + lock_mode = xfs_ilock_data_map_shared(dp); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(dp, ctx); - else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) + rval = xfs_dir2_sf_getdents(&args, ctx); + else if ((rval = xfs_dir2_isblock(&args, &v))) ; else if (v) - rval = xfs_dir2_block_getdents(dp, ctx); + rval = xfs_dir2_block_getdents(&args, ctx); else - rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); + rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); + xfs_iunlock(dp, lock_mode); + return rval; } diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 3ef6d402084..53c3be619db 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -17,22 +17,22 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_error.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_trace.h" +#include "xfs_dinode.h" /* * Prototypes for internal functions. @@ -57,89 +57,6 @@ static void xfs_dir2_sf_toino8(xfs_da_args_t *args); #endif /* XFS_BIG_INUMS */ /* - * Inode numbers in short-form directories can come in two versions, - * either 4 bytes or 8 bytes wide. These helpers deal with the - * two forms transparently by looking at the headers i8count field. - * - * For 64-bit inode number the most significant byte must be zero. - */ -static xfs_ino_t -xfs_dir2_sf_get_ino( - struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *from) -{ - if (hdr->i8count) - return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; - else - return get_unaligned_be32(&from->i4.i); -} - -static void -xfs_dir2_sf_put_ino( - struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *to, - xfs_ino_t ino) -{ - ASSERT((ino & 0xff00000000000000ULL) == 0); - - if (hdr->i8count) - put_unaligned_be64(ino, &to->i8.i); - else - put_unaligned_be32(ino, &to->i4.i); -} - -xfs_ino_t -xfs_dir2_sf_get_parent_ino( - struct xfs_dir2_sf_hdr *hdr) -{ - return xfs_dir2_sf_get_ino(hdr, &hdr->parent); -} - -void -xfs_dir2_sf_put_parent_ino( - struct xfs_dir2_sf_hdr *hdr, - xfs_ino_t ino) -{ - xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); -} - -/* - * In short-form directory entries the inode numbers are stored at variable - * offset behind the entry name. If the entry stores a filetype value, then it - * sits between the name and the inode number. Hence the inode numbers may only - * be accessed through the helpers below. - */ -static xfs_dir2_inou_t * -xfs_dir3_sfe_inop( - struct xfs_mount *mp, - struct xfs_dir2_sf_entry *sfep) -{ - __uint8_t *ptr = &sfep->name[sfep->namelen]; - if (xfs_sb_version_hasftype(&mp->m_sb)) - ptr++; - return (xfs_dir2_inou_t *)ptr; -} - -xfs_ino_t -xfs_dir3_sfe_get_ino( - struct xfs_mount *mp, - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return xfs_dir2_sf_get_ino(hdr, xfs_dir3_sfe_inop(mp, sfep)); -} - -void -xfs_dir3_sfe_put_ino( - struct xfs_mount *mp, - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep, - xfs_ino_t ino) -{ - xfs_dir2_sf_put_ino(hdr, xfs_dir3_sfe_inop(mp, sfep), ino); -} - -/* * Given a block directory (dp/block), calculate its size as a shortform (sf) * directory and a header for the sf directory, if it will fit it the * space currently present in the inode. If it won't fit, the output @@ -165,8 +82,10 @@ xfs_dir2_block_sfsize( xfs_ino_t parent = 0; /* parent inode number */ int size=0; /* total computed size */ int has_ftype; + struct xfs_da_geometry *geo; mp = dp->i_mount; + geo = mp->m_dir_geo; /* * if there is a filetype field, add the extra byte to the namelen @@ -175,7 +94,7 @@ xfs_dir2_block_sfsize( has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0; count = i8count = namelen = 0; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -187,8 +106,8 @@ xfs_dir2_block_sfsize( /* * Calculate the pointer to the entry at hand. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr)); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(geo, addr)); /* * Detect . and .., so we can special-case them. * . is not included in sf directories. @@ -226,7 +145,7 @@ xfs_dir2_block_sfsize( */ sfhp->count = count; sfhp->i8count = i8count; - xfs_dir2_sf_put_parent_ino(sfhp, parent); + dp->d_ops->sf_put_parent_ino(sfhp, parent); return size; } @@ -253,6 +172,7 @@ xfs_dir2_block_to_sf( char *ptr; /* current data pointer */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ + xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */ trace_xfs_dir2_block_to_sf(args); @@ -260,40 +180,25 @@ xfs_dir2_block_to_sf( mp = dp->i_mount; /* - * Make a copy of the block data, so we can shrink the inode - * and add local data. + * allocate a temporary destination buffer the size of the inode + * to format the data into. Once we have formatted the data, we + * can free the block and copy the formatted data into the inode literal + * area. */ - hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP); - memcpy(hdr, bp->b_addr, mp->m_dirblksize); - logflags = XFS_ILOG_CORE; - if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { - ASSERT(error != ENOSPC); - goto out; - } + dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); + hdr = bp->b_addr; /* - * The buffer is now unconditionally gone, whether - * xfs_dir2_shrink_inode worked or not. - * - * Convert the inode to local format. - */ - dp->i_df.if_flags &= ~XFS_IFEXTENTS; - dp->i_df.if_flags |= XFS_IFINLINE; - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; - ASSERT(dp->i_df.if_bytes == 0); - xfs_idata_realloc(dp, size, XFS_DATA_FORK); - logflags |= XFS_ILOG_DDATA; - /* * Copy the header into the newly allocate local space. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dst; memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); - dp->i_d.di_size = size; + /* * Set up to loop over the block's entries. */ - btp = xfs_dir2_block_tail_p(mp, hdr); - ptr = (char *)xfs_dir3_data_entry_p(hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + ptr = (char *)dp->d_ops->data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); sfep = xfs_dir2_sf_firstentry(sfp); /* @@ -321,7 +226,7 @@ xfs_dir2_block_to_sf( else if (dep->namelen == 2 && dep->name[0] == '.' && dep->name[1] == '.') ASSERT(be64_to_cpu(dep->inumber) == - xfs_dir2_sf_get_parent_ino(sfp)); + dp->d_ops->sf_get_parent_ino(sfp)); /* * Normal entry, copy it into shortform. */ @@ -331,20 +236,44 @@ xfs_dir2_block_to_sf( (xfs_dir2_data_aoff_t) ((char *)dep - (char *)hdr)); memcpy(sfep->name, dep->name, dep->namelen); - xfs_dir3_sfe_put_ino(mp, sfp, sfep, - be64_to_cpu(dep->inumber)); - xfs_dir3_sfe_put_ftype(mp, sfp, sfep, - xfs_dir3_dirent_get_ftype(mp, dep)); + dp->d_ops->sf_put_ino(sfp, sfep, + be64_to_cpu(dep->inumber)); + dp->d_ops->sf_put_ftype(sfep, + dp->d_ops->data_get_ftype(dep)); - sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); } - ptr += xfs_dir3_data_entsize(mp, dep->namelen); + ptr += dp->d_ops->data_entsize(dep->namelen); } ASSERT((char *)sfep - (char *)sfp == size); + + /* now we are done with the block, we can shrink the inode */ + logflags = XFS_ILOG_CORE; + error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp); + if (error) { + ASSERT(error != ENOSPC); + goto out; + } + + /* + * The buffer is now unconditionally gone, whether + * xfs_dir2_shrink_inode worked or not. + * + * Convert the inode to local format and copy the data in. + */ + dp->i_df.if_flags &= ~XFS_IFEXTENTS; + dp->i_df.if_flags |= XFS_IFINLINE; + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + + logflags |= XFS_ILOG_DDATA; + memcpy(dp->i_df.if_u1.if_data, dst, size); + dp->i_d.di_size = size; xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(hdr); + kmem_free(dst); return error; } @@ -358,14 +287,12 @@ int /* error */ xfs_dir2_sf_addname( xfs_da_args_t *args) /* operation arguments */ { - int add_entsize; /* size of the new entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ int incr_isize; /* total change in size */ int new_isize; /* di_size after adding name */ int objchange; /* changing to 8-byte inodes */ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ - int old_isize; /* di_size before adding name */ int pick; /* which algorithm to use */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ @@ -389,8 +316,7 @@ xfs_dir2_sf_addname( /* * Compute entry (and change in) size. */ - add_entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen); - incr_isize = add_entsize; + incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); objchange = 0; #if XFS_BIG_INUMS /* @@ -398,11 +324,8 @@ xfs_dir2_sf_addname( */ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { /* - * Yes, adjust the entry size and the total size. + * Yes, adjust the inode size. old count + (parent + new) */ - add_entsize += - (uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t); incr_isize += (sfp->count + 2) * ((uint)sizeof(xfs_dir2_ino8_t) - @@ -410,8 +333,7 @@ xfs_dir2_sf_addname( objchange = 1; } #endif - old_isize = (int)dp->i_d.di_size; - new_isize = old_isize + incr_isize; + new_isize = (int)dp->i_d.di_size + incr_isize; /* * Won't fit as shortform any more (due to size), * or the pick routine says it won't (due to offset values). @@ -483,8 +405,7 @@ xfs_dir2_sf_addname_easy( /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, - xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen), + xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen), XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. @@ -497,8 +418,8 @@ xfs_dir2_sf_addname_easy( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep, args->inumber); - xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep, args->filetype); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); /* * Update the header and inode. @@ -557,13 +478,13 @@ xfs_dir2_sf_addname_hard( * to insert the new entry. * If it's going to end up at the end then oldsfep will point there. */ - for (offset = xfs_dir3_data_first_offset(mp), + for (offset = dp->d_ops->data_first_offset, oldsfep = xfs_dir2_sf_firstentry(oldsfp), - add_datasize = xfs_dir3_data_entsize(mp, args->namelen), + add_datasize = dp->d_ops->data_entsize(args->namelen), eof = (char *)oldsfep == &buf[old_isize]; !eof; - offset = new_offset + xfs_dir3_data_entsize(mp, oldsfep->namelen), - oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep), + offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep), eof = (char *)oldsfep == &buf[old_isize]) { new_offset = xfs_dir2_sf_get_offset(oldsfep); if (offset + add_datasize <= new_offset) @@ -592,8 +513,8 @@ xfs_dir2_sf_addname_hard( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - xfs_dir3_sfe_put_ino(mp, sfp, sfep, args->inumber); - xfs_dir3_sfe_put_ftype(mp, sfp, sfep, args->filetype); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); sfp->count++; #if XFS_BIG_INUMS if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) @@ -603,7 +524,7 @@ xfs_dir2_sf_addname_hard( * If there's more left to copy, do that. */ if (!eof) { - sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } kmem_free(buf); @@ -639,8 +560,8 @@ xfs_dir2_sf_addname_pick( mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - size = xfs_dir3_data_entsize(mp, args->namelen); - offset = xfs_dir3_data_first_offset(mp); + size = dp->d_ops->data_entsize(args->namelen); + offset = dp->d_ops->data_first_offset; sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* @@ -652,8 +573,8 @@ xfs_dir2_sf_addname_pick( if (!holefit) holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); offset = xfs_dir2_sf_get_offset(sfep) + - xfs_dir3_data_entsize(mp, sfep->namelen); - sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); + dp->d_ops->data_entsize(sfep->namelen); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); } /* * Calculate data bytes used excluding the new entry, if this @@ -667,7 +588,7 @@ xfs_dir2_sf_addname_pick( * we'll go back, convert to block, then try the insert and convert * to leaf. */ - if (used + (holefit ? 0 : size) > mp->m_dirblksize) + if (used + (holefit ? 0 : size) > args->geo->blksize) return 0; /* * If changing the inode number size, do it the hard way. @@ -682,7 +603,7 @@ xfs_dir2_sf_addname_pick( /* * If it won't fit at the end then do it the hard way (use the hole). */ - if (used + size > mp->m_dirblksize) + if (used + size > args->geo->blksize) return 2; /* * Do it the easy way. @@ -713,28 +634,27 @@ xfs_dir2_sf_check( mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - offset = xfs_dir3_data_first_offset(mp); - ino = xfs_dir2_sf_get_parent_ino(sfp); + offset = dp->d_ops->data_first_offset; + ino = dp->d_ops->sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep)) { + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); - ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep); + ino = dp->d_ops->sf_get_ino(sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; offset = xfs_dir2_sf_get_offset(sfep) + - xfs_dir3_data_entsize(mp, sfep->namelen); - ASSERT(xfs_dir3_sfe_get_ftype(mp, sfp, sfep) < - XFS_DIR3_FT_MAX); + dp->d_ops->data_entsize(sfep->namelen); + ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX); } ASSERT(i8count == sfp->i8count); ASSERT(XFS_BIG_INUMS || i8count == 0); ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); ASSERT(offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + - (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dirblksize); + (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize); } #endif /* DEBUG */ @@ -783,7 +703,7 @@ xfs_dir2_sf_create( /* * Now can put in the inode number, since i8count is set. */ - xfs_dir2_sf_put_parent_ino(sfp, pino); + dp->d_ops->sf_put_parent_ino(sfp, pino); sfp->count = 0; dp->i_d.di_size = size; xfs_dir2_sf_check(args); @@ -838,7 +758,7 @@ xfs_dir2_sf_lookup( */ if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { - args->inumber = xfs_dir2_sf_get_parent_ino(sfp); + args->inumber = dp->d_ops->sf_get_parent_ino(sfp); args->cmpresult = XFS_CMP_EXACT; args->filetype = XFS_DIR3_FT_DIR; return XFS_ERROR(EEXIST); @@ -848,7 +768,7 @@ xfs_dir2_sf_lookup( */ ci_sfep = NULL; for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) { + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { /* * Compare name and if it's an exact match, return the inode * number. If it's the first case-insensitive match, store the @@ -858,10 +778,8 @@ xfs_dir2_sf_lookup( sfep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; - args->inumber = xfs_dir3_sfe_get_ino(dp->i_mount, - sfp, sfep); - args->filetype = xfs_dir3_sfe_get_ftype(dp->i_mount, - sfp, sfep); + args->inumber = dp->d_ops->sf_get_ino(sfp, sfep); + args->filetype = dp->d_ops->sf_get_ftype(sfep); if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); ci_sfep = sfep; @@ -917,10 +835,10 @@ xfs_dir2_sf_removename( * Find the one we're deleting. */ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) { + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { - ASSERT(xfs_dir3_sfe_get_ino(dp->i_mount, sfp, sfep) == + ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) == args->inumber); break; } @@ -934,7 +852,7 @@ xfs_dir2_sf_removename( * Calculate sizes. */ byteoff = (int)((char *)sfep - (char *)sfp); - entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen); + entsize = dp->d_ops->sf_entsize(sfp, args->namelen); newsize = oldsize - entsize; /* * Copy the part if any after the removed entry, sliding it down. @@ -1041,28 +959,25 @@ xfs_dir2_sf_replace( if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { #if XFS_BIG_INUMS || defined(DEBUG) - ino = xfs_dir2_sf_get_parent_ino(sfp); + ino = dp->d_ops->sf_get_parent_ino(sfp); ASSERT(args->inumber != ino); #endif - xfs_dir2_sf_put_parent_ino(sfp, args->inumber); + dp->d_ops->sf_put_parent_ino(sfp, args->inumber); } /* * Normal entry, look for the name. */ else { for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) { + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { #if XFS_BIG_INUMS || defined(DEBUG) - ino = xfs_dir3_sfe_get_ino(dp->i_mount, - sfp, sfep); + ino = dp->d_ops->sf_get_ino(sfp, sfep); ASSERT(args->inumber != ino); #endif - xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep, - args->inumber); - xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep, - args->filetype); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); break; } } @@ -1165,22 +1080,21 @@ xfs_dir2_sf_toino4( */ sfp->count = oldsfp->count; sfp->i8count = 0; - xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp)); + dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. */ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->count; - i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep), - oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) { + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - xfs_dir3_sfe_put_ino(mp, sfp, sfep, - xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep)); - xfs_dir3_sfe_put_ftype(mp, sfp, sfep, - xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep)); + dp->d_ops->sf_put_ino(sfp, sfep, + dp->d_ops->sf_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); } /* * Clean up the inode. @@ -1191,9 +1105,9 @@ xfs_dir2_sf_toino4( } /* - * Convert from 4-byte inode numbers to 8-byte inode numbers. - * The new 8-byte inode number is not there yet, we leave with the - * count 1 but no corresponding entry. + * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers. + * The new entry w/ an 8-byte inode number is not there yet; we leave with + * i8count set to 1, but no corresponding 8-byte entry. */ static void xfs_dir2_sf_toino8( @@ -1226,7 +1140,7 @@ xfs_dir2_sf_toino8( ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* - * Compute the new inode size. + * Compute the new inode size (nb: entry count + 1 for parent) */ newsize = oldsize + @@ -1244,22 +1158,21 @@ xfs_dir2_sf_toino8( */ sfp->count = oldsfp->count; sfp->i8count = 1; - xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp)); + dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. */ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->count; - i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep), - oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) { + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - xfs_dir3_sfe_put_ino(mp, sfp, sfep, - xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep)); - xfs_dir3_sfe_put_ftype(mp, sfp, sfep, - xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep)); + dp->d_ops->sf_put_ino(sfp, sfep, + dp->d_ops->sf_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); } /* * Clean up the inode. diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 45560ee1a4b..4f11ef01113 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -17,22 +17,21 @@ */ #include "xfs.h" #include "xfs_format.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" -#include "xfs_alloc_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_btree.h" #include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_extent_busy.h" #include "xfs_discard.h" #include "xfs_trace.h" +#include "xfs_log.h" STATIC int xfs_trim_extents( @@ -158,7 +157,7 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; xfs_daddr_t start, end, minlen; @@ -181,7 +180,8 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface. */ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || - range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp))) + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.len < mp->m_sb.sb_blocksize) return -XFS_ERROR(EINVAL); start = BTOBB(range.start); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1ee776d477c..3ee0cd43edc 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -18,28 +18,28 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" -#include "xfs_rtalloc.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" #include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_attr.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" #include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_bmap_btree.h" /* * Lock order: @@ -292,118 +292,6 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; } -STATIC bool -xfs_dquot_buf_verify_crc( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - int ndquots; - int i; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return true; - - /* - * if we are in log recovery, the quota subsystem has not been - * initialised so we have no quotainfo structure. In that case, we need - * to manually calculate the number of dquots in the buffer. - */ - if (mp->m_quotainfo) - ndquots = mp->m_quotainfo->qi_dqperchunk; - else - ndquots = xfs_qm_calc_dquots_per_chunk(mp, - XFS_BB_TO_FSB(mp, bp->b_length)); - - for (i = 0; i < ndquots; i++, d++) { - if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), - XFS_DQUOT_CRC_OFF)) - return false; - if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) - return false; - } - return true; -} - -STATIC bool -xfs_dquot_buf_verify( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - xfs_dqid_t id = 0; - int ndquots; - int i; - - /* - * if we are in log recovery, the quota subsystem has not been - * initialised so we have no quotainfo structure. In that case, we need - * to manually calculate the number of dquots in the buffer. - */ - if (mp->m_quotainfo) - ndquots = mp->m_quotainfo->qi_dqperchunk; - else - ndquots = xfs_qm_calc_dquots_per_chunk(mp, bp->b_length); - - /* - * On the first read of the buffer, verify that each dquot is valid. - * We don't know what the id of the dquot is supposed to be, just that - * they should be increasing monotonically within the buffer. If the - * first id is corrupt, then it will fail on the second dquot in the - * buffer so corruptions could point to the wrong dquot in this case. - */ - for (i = 0; i < ndquots; i++) { - struct xfs_disk_dquot *ddq; - int error; - - ddq = &d[i].dd_diskdq; - - if (i == 0) - id = be32_to_cpu(ddq->d_id); - - error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_buf_verify"); - if (error) - return false; - } - return true; -} - -static void -xfs_dquot_buf_read_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - - if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } -} - -/* - * we don't calculate the CRC here as that is done when the dquot is flushed to - * the buffer after the update is done. This ensures that the dquot in the - * buffer always has an up-to-date CRC value. - */ -void -xfs_dquot_buf_write_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - - if (!xfs_dquot_buf_verify(mp, bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - return; - } -} - -const struct xfs_buf_ops xfs_dquot_buf_ops = { - .verify_read = xfs_dquot_buf_read_verify, - .verify_write = xfs_dquot_buf_write_verify, -}; - /* * Allocate a block and fill it with dquots. * This is called when the bmapi finds a hole. @@ -465,10 +353,10 @@ xfs_qm_dqalloc( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); - - error = xfs_buf_geterror(bp); - if (error) + if (!bp) { + error = ENOMEM; goto error1; + } bp->b_ops = &xfs_dquot_buf_ops; /* @@ -514,6 +402,7 @@ xfs_qm_dqalloc( return (error); } + STATIC int xfs_qm_dqrepair( struct xfs_mount *mp, @@ -547,7 +436,7 @@ xfs_qm_dqrepair( /* Do the actual repair of dquots in this buffer */ for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { ddq = &d[i].dd_diskdq; - error = xfs_qm_dqcheck(mp, ddq, firstid + i, + error = xfs_dqcheck(mp, ddq, firstid + i, dqp->dq_flags & XFS_DQ_ALLTYPES, XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); if (error) { @@ -580,16 +469,17 @@ xfs_qm_dqtobp( struct xfs_mount *mp = dqp->q_mount; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); struct xfs_trans *tp = (tpp ? *tpp : NULL); + uint lock_mode; dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; - xfs_ilock(quotip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(quotip); if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. */ - xfs_iunlock(quotip, XFS_ILOCK_SHARED); + xfs_iunlock(quotip, lock_mode); return ESRCH; } @@ -599,7 +489,7 @@ xfs_qm_dqtobp( error = xfs_bmapi_read(quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); - xfs_iunlock(quotip, XFS_ILOCK_SHARED); + xfs_iunlock(quotip, lock_mode); if (error) return error; @@ -725,7 +615,7 @@ xfs_qm_dqread( if (flags & XFS_QMOPT_DQALLOC) { tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrsetm, + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc, XFS_QM_DQALLOC_SPACE_RES(mp), 0); if (error) goto error1; @@ -942,47 +832,6 @@ restart: return (0); } - -STATIC void -xfs_qm_dqput_final( - struct xfs_dquot *dqp) -{ - struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; - struct xfs_dquot *gdqp; - struct xfs_dquot *pdqp; - - trace_xfs_dqput_free(dqp); - - if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) - XFS_STATS_INC(xs_qm_dquot_unused); - - /* - * If we just added a udquot to the freelist, then we want to release - * the gdquot/pdquot reference that it (probably) has. Otherwise it'll - * keep the gdquot/pdquot from getting reclaimed. - */ - gdqp = dqp->q_gdquot; - if (gdqp) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - - pdqp = dqp->q_pdquot; - if (pdqp) { - xfs_dqlock(pdqp); - dqp->q_pdquot = NULL; - } - xfs_dqunlock(dqp); - - /* - * If we had a group/project quota hint, release it now. - */ - if (gdqp) - xfs_qm_dqput(gdqp); - if (pdqp) - xfs_qm_dqput(pdqp); -} - /* * Release a reference to the dquot (decrement ref-count) and unlock it. * @@ -998,10 +847,14 @@ xfs_qm_dqput( trace_xfs_dqput(dqp); - if (--dqp->q_nrefs > 0) - xfs_dqunlock(dqp); - else - xfs_qm_dqput_final(dqp); + if (--dqp->q_nrefs == 0) { + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; + trace_xfs_dqput_free(dqp); + + if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) + XFS_STATS_INC(xs_qm_dquot_unused); + } + xfs_dqunlock(dqp); } /* @@ -1133,7 +986,7 @@ xfs_qm_dqflush( /* * A simple sanity check in case we got a corrupted dquot.. */ - error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, + error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)"); if (error) { xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 55abbca2883..68a68f70483 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -52,8 +52,6 @@ typedef struct xfs_dquot { int q_bufoffset; /* off of dq in buffer (# dquots) */ xfs_fileoff_t q_fileoffset; /* offset in quotas file */ - struct xfs_dquot*q_gdquot; /* group dquot, hint only */ - struct xfs_dquot*q_pdquot; /* project dquot, hint only */ xfs_disk_dquot_t q_core; /* actual usage & quotas */ xfs_dq_logitem_t q_logitem; /* dquot log item */ xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ @@ -172,6 +170,4 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } -extern const struct xfs_buf_ops xfs_dquot_buf_ops; - #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c new file mode 100644 index 00000000000..c2ac0c611ad --- /dev/null +++ b/fs/xfs/xfs_dquot_buf.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_qm.h" +#include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" + +int +xfs_calc_dquots_per_chunk( + unsigned int nbblks) /* basic block units */ +{ + unsigned int ndquots; + + ASSERT(nbblks > 0); + ndquots = BBTOB(nbblks); + do_div(ndquots, sizeof(xfs_dqblk_t)); + + return ndquots; +} + +/* + * Do some primitive error checking on ondisk dquot data structures. + */ +int +xfs_dqcheck( + struct xfs_mount *mp, + xfs_disk_dquot_t *ddq, + xfs_dqid_t id, + uint type, /* used only when IO_dorepair is true */ + uint flags, + char *str) +{ + xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; + int errs = 0; + + /* + * We can encounter an uninitialized dquot buffer for 2 reasons: + * 1. If we crash while deleting the quotainode(s), and those blks got + * used for user data. This is because we take the path of regular + * file deletion; however, the size field of quotainodes is never + * updated, so all the tricks that we play in itruncate_finish + * don't quite matter. + * + * 2. We don't play the quota buffers when there's a quotaoff logitem. + * But the allocation will be replayed so we'll end up with an + * uninitialized quota block. + * + * This is all fine; things are still consistent, and we haven't lost + * any quota information. Just don't complain about bad dquot blks. + */ + if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", + str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); + errs++; + } + if (ddq->d_version != XFS_DQUOT_VERSION) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", + str, id, ddq->d_version, XFS_DQUOT_VERSION); + errs++; + } + + if (ddq->d_flags != XFS_DQ_USER && + ddq->d_flags != XFS_DQ_PROJ && + ddq->d_flags != XFS_DQ_GROUP) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, unknown flags 0x%x", + str, id, ddq->d_flags); + errs++; + } + + if (id != -1 && id != be32_to_cpu(ddq->d_id)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : ondisk-dquot 0x%p, ID mismatch: " + "0x%x expected, found id 0x%x", + str, ddq, id, be32_to_cpu(ddq->d_id)); + errs++; + } + + if (!errs && ddq->d_id) { + if (ddq->d_blk_softlimit && + be64_to_cpu(ddq->d_bcount) > + be64_to_cpu(ddq->d_blk_softlimit)) { + if (!ddq->d_btimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_ino_softlimit && + be64_to_cpu(ddq->d_icount) > + be64_to_cpu(ddq->d_ino_softlimit)) { + if (!ddq->d_itimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_rtb_softlimit && + be64_to_cpu(ddq->d_rtbcount) > + be64_to_cpu(ddq->d_rtb_softlimit)) { + if (!ddq->d_rtbtimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + } + + if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) + return errs; + + if (flags & XFS_QMOPT_DOWARN) + xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); + + /* + * Typically, a repair is only requested by quotacheck. + */ + ASSERT(id != -1); + ASSERT(flags & XFS_QMOPT_DQREPAIR); + memset(d, 0, sizeof(xfs_dqblk_t)); + + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_flags = type; + d->dd_diskdq.d_id = cpu_to_be32(id); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + return errs; +} + +STATIC bool +xfs_dquot_buf_verify_crc( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + int ndquots; + int i; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return true; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_calc_dquots_per_chunk( + XFS_BB_TO_FSB(mp, bp->b_length)); + + for (i = 0; i < ndquots; i++, d++) { + if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF)) + return false; + if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) + return false; + } + return true; +} + +STATIC bool +xfs_dquot_buf_verify( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + xfs_dqid_t id = 0; + int ndquots; + int i; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_calc_dquots_per_chunk(bp->b_length); + + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < ndquots; i++) { + struct xfs_disk_dquot *ddq; + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_buf_verify"); + if (error) + return false; + } + return true; +} + +static void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dquot_buf_verify(mp, bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +/* + * we don't calculate the CRC here as that is done when the dquot is flushed to + * the buffer after the update is done. This ensures that the dquot in the + * buffer always has an up-to-date CRC value. + */ +static void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify(mp, bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } +} + +const struct xfs_buf_ops xfs_dquot_buf_ops = { + .verify_read = xfs_dquot_buf_read_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; + diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index e838d84b4e8..f33fbaaa4d8 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -18,23 +18,19 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" +#include "xfs_quota.h" #include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_attr.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" +#include "xfs_log.h" static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) { @@ -61,20 +57,24 @@ xfs_qm_dquot_logitem_size( STATIC void xfs_qm_dquot_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *logvec) + struct xfs_log_vec *lv) { struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); - - logvec->i_addr = &qlip->qli_format; - logvec->i_len = sizeof(xfs_dq_logformat_t); - logvec->i_type = XLOG_REG_TYPE_QFORMAT; - logvec++; - logvec->i_addr = &qlip->qli_dquot->q_core; - logvec->i_len = sizeof(xfs_disk_dquot_t); - logvec->i_type = XLOG_REG_TYPE_DQUOT; - - qlip->qli_format.qlf_size = 2; - + struct xfs_log_iovec *vecp = NULL; + struct xfs_dq_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); + qlf->qlf_type = XFS_LI_DQUOT; + qlf->qlf_size = 2; + qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id); + qlf->qlf_blkno = qlip->qli_dquot->q_blkno; + qlf->qlf_len = 1; + qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, + &qlip->qli_dquot->q_core, + sizeof(struct xfs_disk_dquot)); } /* @@ -261,18 +261,6 @@ xfs_qm_dquot_logitem_init( xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); lp->qli_dquot = dqp; - lp->qli_format.qlf_type = XFS_LI_DQUOT; - lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); - lp->qli_format.qlf_blkno = dqp->q_blkno; - lp->qli_format.qlf_len = 1; - /* - * This is just the offset of this dquot within its buffer - * (which is currently 1 FSB and probably won't change). - * Hence 32 bits for this offset should be just fine. - * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t)) - * here, and recompute it at recovery time. - */ - lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset; } /*------------------ QUOTAOFF LOG ITEMS -------------------*/ @@ -298,26 +286,20 @@ xfs_qm_qoff_logitem_size( *nbytes += sizeof(struct xfs_qoff_logitem); } -/* - * This is called to fill in the vector of log iovecs for the - * given quotaoff log item. We use only 1 iovec, and we point that - * at the quotaoff_log_format structure embedded in the quotaoff item. - * It is at this point that we assert that all of the extent - * slots in the quotaoff item have been filled. - */ STATIC void xfs_qm_qoff_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); - - ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); - - log_vector->i_addr = &qflip->qql_format; - log_vector->i_len = sizeof(xfs_qoff_logitem_t); - log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; - qflip->qql_format.qf_size = 1; + struct xfs_log_iovec *vecp = NULL; + struct xfs_qoff_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF); + qlf->qf_type = XFS_LI_QUOTAOFF; + qlf->qf_size = 1; + qlf->qf_flags = qflip->qql_flags; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem)); } /* @@ -457,8 +439,7 @@ xfs_qm_qoff_logitem_init( xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); qf->qql_item.li_mountp = mp; - qf->qql_format.qf_type = XFS_LI_QUOTAOFF; - qf->qql_format.qf_flags = flags; qf->qql_start_lip = start; + qf->qql_flags = flags; return qf; } diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 5acae2ada70..502e9464634 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -27,13 +27,12 @@ typedef struct xfs_dq_logitem { xfs_log_item_t qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ - xfs_dq_logformat_t qli_format; /* logged structure */ } xfs_dq_logitem_t; typedef struct xfs_qoff_logitem { xfs_log_item_t qql_item; /* common portion */ struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ - xfs_qoff_logformat_t qql_format; /* logged structure */ + unsigned int qql_flags; } xfs_qoff_logitem_t; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 1123d93ff79..edac5b057d2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -16,16 +16,13 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_format.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" #include "xfs_error.h" #ifdef DEBUG @@ -159,7 +156,7 @@ xfs_error_report( { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, - "Internal error %s at line %d of file %s. Caller 0x%p\n", + "Internal error %s at line %d of file %s. Caller %pF", tag, linenum, filename, ra); xfs_stack_trace(); @@ -181,3 +178,28 @@ xfs_corruption_error( xfs_error_report(tag, level, mp, filename, linenum, ra); xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } + +/* + * Warnings specifically for verifier errors. Differentiate CRC vs. invalid + * values, and omit the stack trace unless the error level is tuned high. + */ +void +xfs_verifier_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", + bp->b_error == EFSBADCRC ? "CRC error" : "corruption", + __return_address, bp->b_bn); + + xfs_alert(mp, "Unmount and run xfs_repair"); + + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:"); + xfs_hex_dump(xfs_buf_offset(bp, 0), 64); + } + + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 079a367f44e..c1c57d4a4b5 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -34,6 +34,7 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, int linenum, inst_t *ra); +extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 066df425c14..753e467aa1a 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -16,21 +16,21 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_dir2_format.h" +#include "xfs_da_format.h" #include "xfs_dir2.h" #include "xfs_export.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_log.h" /* * Note that we only accept fileids which are long enough rather than allow @@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index e43708e2f08..fd22f69049d 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -19,17 +19,18 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_alloc.h" -#include "xfs_inode.h" #include "xfs_extent_busy.h" #include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_log.h" void xfs_extent_busy_insert( diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 985412d65ba..bfff284d2dc 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -20,6 +20,10 @@ #ifndef __XFS_EXTENT_BUSY_H__ #define __XFS_EXTENT_BUSY_H__ +struct xfs_mount; +struct xfs_trans; +struct xfs_alloc_arg; + /* * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that * have been freed but whose transactions aren't committed to disk yet. diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index dc53e8febbb..fb7a4c1ce1c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -17,15 +17,16 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_buf_item.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" +#include "xfs_buf_item.h" #include "xfs_extfree_item.h" +#include "xfs_log.h" kmem_zone_t *xfs_efi_zone; @@ -101,9 +102,10 @@ xfs_efi_item_size( STATIC void xfs_efi_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; ASSERT(atomic_read(&efip->efi_next_extent) == efip->efi_format.efi_nextents); @@ -111,10 +113,9 @@ xfs_efi_item_format( efip->efi_format.efi_type = XFS_LI_EFI; efip->efi_format.efi_size = 1; - log_vector->i_addr = &efip->efi_format; - log_vector->i_len = xfs_efi_item_sizeof(efip); - log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; - ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, + &efip->efi_format, + xfs_efi_item_sizeof(efip)); } @@ -368,19 +369,19 @@ xfs_efd_item_size( STATIC void xfs_efd_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); efdp->efd_format.efd_type = XFS_LI_EFD; efdp->efd_format.efd_size = 1; - log_vector->i_addr = &efdp->efd_format; - log_vector->i_len = xfs_efd_item_sizeof(efdp); - log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; - ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, + &efdp->efd_format, + xfs_efd_item_sizeof(efdp)); } /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4c749ab543d..1f66779d7a4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -17,25 +17,27 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_log.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_trans.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_error.h" -#include "xfs_da_btree.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_ioctl.h" #include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_dinode.h" #include <linux/aio.h> #include <linux/dcache.h> @@ -153,7 +155,7 @@ xfs_dir_fsync( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } STATIC int @@ -227,39 +229,33 @@ xfs_file_fsync( } STATIC ssize_t -xfs_file_aio_read( +xfs_file_read_iter( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) + struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - size_t size = 0; + size_t size = iov_iter_count(to); ssize_t ret = 0; int ioflags = 0; xfs_fsize_t n; + loff_t pos = iocb->ki_pos; XFS_STATS_INC(xs_read_calls); - BUG_ON(iocb->ki_pos != pos); - if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE); - if (ret < 0) - return ret; - if (unlikely(ioflags & IO_ISDIRECT)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->bt_smask) || (size & target->bt_smask)) { + /* DIO must be aligned to device logical sector size */ + if ((pos | size) & target->bt_logical_sectormask) { if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); @@ -292,7 +288,7 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -filemap_write_and_wait_range( + ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, pos, -1); if (ret) { @@ -306,7 +302,7 @@ xfs_file_aio_read( trace_xfs_file_read(ip, size, pos, ioflags); - ret = generic_file_aio_read(iocb, iovp, nr_segs, pos); + ret = generic_file_read_iter(iocb, to); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -347,47 +343,6 @@ xfs_file_splice_read( } /* - * xfs_file_splice_write() does not use xfs_rw_ilock() because - * generic_file_splice_write() takes the i_mutex itself. This, in theory, - * couuld cause lock inversions between the aio_write path and the splice path - * if someone is doing concurrent splice(2) based writes and write(2) based - * writes to the same inode. The only real way to fix this is to re-implement - * the generic code here with correct locking orders. - */ -STATIC ssize_t -xfs_file_splice_write( - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t count, - unsigned int flags) -{ - struct inode *inode = outfilp->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - int ioflags = 0; - ssize_t ret; - - XFS_STATS_INC(xs_write_calls); - - if (outfilp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); - - ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; -} - -/* * This routine is called to handle zeroing any space in the last block of the * file that is beyond the EOF. We do this since the size is being increased * without writing anything to that block and we don't want to read the @@ -622,10 +577,7 @@ restart: STATIC ssize_t xfs_file_dio_aio_write( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos, - size_t ocount) + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -633,15 +585,18 @@ xfs_file_dio_aio_write( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; - size_t count = ocount; int unaligned_io = 0; int iolock; + size_t count = iov_iter_count(from); + loff_t pos = iocb->ki_pos; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->bt_smask) || (count & target->bt_smask)) + /* DIO must be aligned to device logical sector size */ + if ((pos | count) & target->bt_logical_sectormask) return -XFS_ERROR(EINVAL); + /* "unaligned" here means not aligned to a filesystem block */ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; @@ -672,9 +627,10 @@ xfs_file_dio_aio_write( ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) goto out; + iov_iter_truncate(from, count); if (mapping->nrpages) { - ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1); if (ret) goto out; @@ -693,8 +649,7 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); + ret = generic_file_direct_write(iocb, from, pos); out: xfs_rw_iunlock(ip, iolock); @@ -707,10 +662,7 @@ out: STATIC ssize_t xfs_file_buffered_aio_write( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos, - size_t ocount) + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -719,7 +671,8 @@ xfs_file_buffered_aio_write( ssize_t ret; int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - size_t count = ocount; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); xfs_rw_ilock(ip, iolock); @@ -727,14 +680,15 @@ xfs_file_buffered_aio_write( if (ret) goto out; + iov_iter_truncate(from, count); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, 0); - + ret = generic_perform_write(file, from, pos); + if (likely(ret >= 0)) + iocb->ki_pos = pos + ret; /* * If we just got an ENOSPC, try to write back all dirty inodes to * convert delalloc space to free up some of the excess reserved @@ -753,40 +707,29 @@ out: } STATIC ssize_t -xfs_file_aio_write( +xfs_file_write_iter( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - size_t ocount = 0; + size_t ocount = iov_iter_count(from); XFS_STATS_INC(xs_write_calls); - BUG_ON(iocb->ki_pos != pos); - - ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); - if (ret) - return ret; - if (ocount == 0) return 0; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - ret = -EIO; - goto out; - } + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; if (unlikely(file->f_flags & O_DIRECT)) - ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); + ret = xfs_file_dio_aio_write(iocb, from); else - ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount); + ret = xfs_file_buffered_aio_write(iocb, from); if (ret > 0) { ssize_t err; @@ -794,55 +737,99 @@ xfs_file_aio_write( XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } - -out: return ret; } STATIC long xfs_file_fallocate( - struct file *file, - int mode, - loff_t offset, - loff_t len) + struct file *file, + int mode, + loff_t offset, + loff_t len) { - struct inode *inode = file_inode(file); - long error; - loff_t new_size = 0; - xfs_flock64_t bf; - xfs_inode_t *ip = XFS_I(inode); - int cmd = XFS_IOC_RESVSP; - int attr_flags = XFS_ATTR_NOLOCK; + struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_trans *tp; + long error; + loff_t new_size = 0; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; - bf.l_whence = 0; - bf.l_start = offset; - bf.l_len = len; - xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = xfs_free_file_space(ip, offset, len); + if (error) + goto out_unlock; + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + + if (offset & blksize_mask || len & blksize_mask) { + error = EINVAL; + goto out_unlock; + } + + /* + * There is no need to overlap collapse range with EOF, + * in which case it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + error = EINVAL; + goto out_unlock; + } + + new_size = i_size_read(inode) - len; - if (mode & FALLOC_FL_PUNCH_HOLE) - cmd = XFS_IOC_UNRESVSP; + error = xfs_collapse_file_space(ip, offset, len); + if (error) + goto out_unlock; + } else { + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = -inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } - /* check the new inode size is valid before allocating */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); + if (mode & FALLOC_FL_ZERO_RANGE) + error = xfs_zero_file_space(ip, offset, len); + else + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_PREALLOC); if (error) goto out_unlock; } - if (file->f_flags & O_DSYNC) - attr_flags |= XFS_ATTR_SYNC; + tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); + error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_unlock; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + ip->i_d.di_mode &= ~S_ISUID; + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; + + if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE))) + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags); + if (file->f_flags & O_DSYNC) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); if (error) goto out_unlock; @@ -852,12 +839,12 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK); + error = xfs_setattr_size(ip, &iattr); } out_unlock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; + return -error; } @@ -890,9 +877,9 @@ xfs_dir_open( * If there are any blocks, read-ahead block 0 as we're almost * certain to have the next operation be a read there. */ - mode = xfs_ilock_map_shared(ip); + mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_dir3_data_readahead(NULL, ip, 0, -1); + xfs_dir3_data_readahead(ip, 0, -1); xfs_iunlock(ip, mode); return 0; } @@ -1193,7 +1180,7 @@ xfs_seek_data( uint lock; int error; - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); if (start >= isize) { @@ -1272,7 +1259,7 @@ out: offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); if (error) return -error; @@ -1297,7 +1284,7 @@ xfs_seek_hole( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); if (start >= isize) { @@ -1380,7 +1367,7 @@ out: offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); if (error) return -error; @@ -1409,12 +1396,12 @@ xfs_file_llseek( const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = xfs_file_aio_read, - .aio_write = xfs_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = xfs_file_read_iter, + .write_iter = xfs_file_write_iter, .splice_read = xfs_file_splice_read, - .splice_write = xfs_file_splice_write, + .splice_write = iter_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, @@ -1440,6 +1427,7 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index ce78e654d37..8ec81bed799 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * Copyright (c) 2014 Christoph Hellwig. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -16,116 +17,36 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_log.h" -#include "xfs_bmap_btree.h" -#include "xfs_inum.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_ag.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_inum.h" +#include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_alloc.h" #include "xfs_mru_cache.h" +#include "xfs_dinode.h" #include "xfs_filestream.h" #include "xfs_trace.h" -#ifdef XFS_FILESTREAMS_TRACE - -ktrace_t *xfs_filestreams_trace_buf; - -STATIC void -xfs_filestreams_trace( - xfs_mount_t *mp, /* mount point */ - int type, /* type of trace */ - const char *func, /* source function */ - int line, /* source line number */ - __psunsigned_t arg0, - __psunsigned_t arg1, - __psunsigned_t arg2, - __psunsigned_t arg3, - __psunsigned_t arg4, - __psunsigned_t arg5) -{ - ktrace_enter(xfs_filestreams_trace_buf, - (void *)(__psint_t)(type | (line << 16)), - (void *)func, - (void *)(__psunsigned_t)current_pid(), - (void *)mp, - (void *)(__psunsigned_t)arg0, - (void *)(__psunsigned_t)arg1, - (void *)(__psunsigned_t)arg2, - (void *)(__psunsigned_t)arg3, - (void *)(__psunsigned_t)arg4, - (void *)(__psunsigned_t)arg5, - NULL, NULL, NULL, NULL, NULL, NULL); -} - -#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0) -#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0) -#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0) -#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0) -#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) -#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) -#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ - xfs_filestreams_trace(mp, t, __func__, __LINE__, \ - (__psunsigned_t)a0, (__psunsigned_t)a1, \ - (__psunsigned_t)a2, (__psunsigned_t)a3, \ - (__psunsigned_t)a4, (__psunsigned_t)a5) - -#define TRACE_AG_SCAN(mp, ag, ag2) \ - TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2); -#define TRACE_AG_PICK1(mp, max_ag, maxfree) \ - TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree); -#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \ - TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \ - cnt, free, scan, flag) -#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \ - TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2) -#define TRACE_FREE(mp, ip, pip, ag, cnt) \ - TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt) -#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \ - TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt) -#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \ - TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt) -#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \ - TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt) -#define TRACE_ORPHAN(mp, ip, ag) \ - TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag); - - -#else -#define TRACE_AG_SCAN(mp, ag, ag2) -#define TRACE_AG_PICK1(mp, max_ag, maxfree) -#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) -#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) -#define TRACE_FREE(mp, ip, pip, ag, cnt) -#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) -#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) -#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) -#define TRACE_ORPHAN(mp, ip, ag) -#endif - -static kmem_zone_t *item_zone; +struct xfs_fstrm_item { + struct xfs_mru_cache_elem mru; + struct xfs_inode *ip; + xfs_agnumber_t ag; /* AG in use for this directory */ +}; -/* - * Structure for associating a file or a directory with an allocation group. - * The parent directory pointer is only needed for files, but since there will - * generally be vastly more files than directories in the cache, using the same - * data structure simplifies the code with very little memory overhead. - */ -typedef struct fstrm_item -{ - xfs_agnumber_t ag; /* AG currently in use for the file/directory. */ - xfs_inode_t *ip; /* inode self-pointer. */ - xfs_inode_t *pip; /* Parent directory inode pointer. */ -} fstrm_item_t; +enum xfs_fstrm_alloc { + XFS_PICK_USERDATA = 1, + XFS_PICK_LOWSPACE = 2, +}; /* * Allocation group filestream associations are tracked with per-ag atomic - * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a + * counters. These counters allow xfs_filestream_pick_ag() to tell whether a * particular AG already has active filestreams associated with it. The mount * point's m_peraglock is used to protect these counters from per-ag array * re-allocation during a growfs operation. When xfs_growfs_data_private() is @@ -160,7 +81,7 @@ typedef struct fstrm_item * the cache that reference per-ag array elements that have since been * reallocated. */ -static int +int xfs_filestream_peek_ag( xfs_mount_t *mp, xfs_agnumber_t agno) @@ -200,23 +121,40 @@ xfs_filestream_put_ag( xfs_perag_put(pag); } +static void +xfs_fstrm_free_func( + struct xfs_mru_cache_elem *mru) +{ + struct xfs_fstrm_item *item = + container_of(mru, struct xfs_fstrm_item, mru); + + xfs_filestream_put_ag(item->ip->i_mount, item->ag); + + trace_xfs_filestream_free(item->ip, item->ag); + + kmem_free(item); +} + /* * Scan the AGs starting at startag looking for an AG that isn't in use and has * at least minlen blocks free. */ static int -_xfs_filestream_pick_ag( - xfs_mount_t *mp, - xfs_agnumber_t startag, - xfs_agnumber_t *agp, - int flags, - xfs_extlen_t minlen) +xfs_filestream_pick_ag( + struct xfs_inode *ip, + xfs_agnumber_t startag, + xfs_agnumber_t *agp, + int flags, + xfs_extlen_t minlen) { - int streams, max_streams; - int err, trylock, nscan; - xfs_extlen_t longest, free, minfree, maxfree = 0; - xfs_agnumber_t ag, max_ag = NULLAGNUMBER; - struct xfs_perag *pag; + struct xfs_mount *mp = ip->i_mount; + struct xfs_fstrm_item *item; + struct xfs_perag *pag; + xfs_extlen_t longest, free = 0, minfree, maxfree = 0; + xfs_agnumber_t ag, max_ag = NULLAGNUMBER; + int err, trylock, nscan; + + ASSERT(S_ISDIR(ip->i_d.di_mode)); /* 2% of an AG's blocks must be free for it to be chosen. */ minfree = mp->m_sb.sb_agblocks / 50; @@ -228,8 +166,9 @@ _xfs_filestream_pick_ag( trylock = XFS_ALLOC_FLAG_TRYLOCK; for (nscan = 0; 1; nscan++) { + trace_xfs_filestream_scan(ip, ag); + pag = xfs_perag_get(mp, ag); - TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms)); if (!pag->pagf_init) { err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); @@ -246,7 +185,6 @@ _xfs_filestream_pick_ag( /* Keep track of the AG with the most free blocks. */ if (pag->pagf_freeblks > maxfree) { maxfree = pag->pagf_freeblks; - max_streams = atomic_read(&pag->pagf_fstrms); max_ag = ag; } @@ -269,7 +207,6 @@ _xfs_filestream_pick_ag( /* Break out, retaining the reference on the AG. */ free = pag->pagf_freeblks; - streams = atomic_read(&pag->pagf_fstrms); xfs_perag_put(pag); *agp = ag; break; @@ -305,317 +242,98 @@ next_ag: */ if (max_ag != NULLAGNUMBER) { xfs_filestream_get_ag(mp, max_ag); - TRACE_AG_PICK1(mp, max_ag, maxfree); - streams = max_streams; free = maxfree; *agp = max_ag; break; } /* take AG 0 if none matched */ - TRACE_AG_PICK1(mp, max_ag, maxfree); + trace_xfs_filestream_pick(ip, *agp, free, nscan); *agp = 0; return 0; } - TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags); - - return 0; -} - -/* - * Set the allocation group number for a file or a directory, updating inode - * references and per-AG references as appropriate. - */ -static int -_xfs_filestream_update_ag( - xfs_inode_t *ip, - xfs_inode_t *pip, - xfs_agnumber_t ag) -{ - int err = 0; - xfs_mount_t *mp; - xfs_mru_cache_t *cache; - fstrm_item_t *item; - xfs_agnumber_t old_ag; - xfs_inode_t *old_pip; + trace_xfs_filestream_pick(ip, *agp, free, nscan); - /* - * Either ip is a regular file and pip is a directory, or ip is a - * directory and pip is NULL. - */ - ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip && - S_ISDIR(pip->i_d.di_mode)) || - (S_ISDIR(ip->i_d.di_mode) && !pip))); - - mp = ip->i_mount; - cache = mp->m_filestream; - - item = xfs_mru_cache_lookup(cache, ip->i_ino); - if (item) { - ASSERT(item->ip == ip); - old_ag = item->ag; - item->ag = ag; - old_pip = item->pip; - item->pip = pip; - xfs_mru_cache_done(cache); - - /* - * If the AG has changed, drop the old ref and take a new one, - * effectively transferring the reference from old to new AG. - */ - if (ag != old_ag) { - xfs_filestream_put_ag(mp, old_ag); - xfs_filestream_get_ag(mp, ag); - } - - /* - * If ip is a file and its pip has changed, drop the old ref and - * take a new one. - */ - if (pip && pip != old_pip) { - IRELE(old_pip); - IHOLD(pip); - } - - TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag), - ag, xfs_filestream_peek_ag(mp, ag)); + if (*agp == NULLAGNUMBER) return 0; - } - item = kmem_zone_zalloc(item_zone, KM_MAYFAIL); + err = ENOMEM; + item = kmem_alloc(sizeof(*item), KM_MAYFAIL); if (!item) - return ENOMEM; + goto out_put_ag; - item->ag = ag; + item->ag = *agp; item->ip = ip; - item->pip = pip; - err = xfs_mru_cache_insert(cache, ip->i_ino, item); + err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); if (err) { - kmem_zone_free(item_zone, item); - return err; + if (err == EEXIST) + err = 0; + goto out_free_item; } - /* Take a reference on the AG. */ - xfs_filestream_get_ag(mp, ag); - - /* - * Take a reference on the inode itself regardless of whether it's a - * regular file or a directory. - */ - IHOLD(ip); - - /* - * In the case of a regular file, take a reference on the parent inode - * as well to ensure it remains in-core. - */ - if (pip) - IHOLD(pip); - - TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag), - ag, xfs_filestream_peek_ag(mp, ag)); - - return 0; -} - -/* xfs_fstrm_free_func(): callback for freeing cached stream items. */ -STATIC void -xfs_fstrm_free_func( - unsigned long ino, - void *data) -{ - fstrm_item_t *item = (fstrm_item_t *)data; - xfs_inode_t *ip = item->ip; - - ASSERT(ip->i_ino == ino); - - xfs_iflags_clear(ip, XFS_IFILESTREAM); - - /* Drop the reference taken on the AG when the item was added. */ - xfs_filestream_put_ag(ip->i_mount, item->ag); - - TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, - xfs_filestream_peek_ag(ip->i_mount, item->ag)); - - /* - * _xfs_filestream_update_ag() always takes a reference on the inode - * itself, whether it's a file or a directory. Release it here. - * This can result in the inode being freed and so we must - * not hold any inode locks when freeing filesstreams objects - * otherwise we can deadlock here. - */ - IRELE(ip); - - /* - * In the case of a regular file, _xfs_filestream_update_ag() also - * takes a ref on the parent inode to keep it in-core. Release that - * too. - */ - if (item->pip) - IRELE(item->pip); - - /* Finally, free the memory allocated for the item. */ - kmem_zone_free(item_zone, item); -} - -/* - * xfs_filestream_init() is called at xfs initialisation time to set up the - * memory zone that will be used for filestream data structure allocation. - */ -int -xfs_filestream_init(void) -{ - item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); - if (!item_zone) - return -ENOMEM; - return 0; -} - -/* - * xfs_filestream_uninit() is called at xfs termination time to destroy the - * memory zone that was used for filestream data structure allocation. - */ -void -xfs_filestream_uninit(void) -{ - kmem_zone_destroy(item_zone); -} - -/* - * xfs_filestream_mount() is called when a file system is mounted with the - * filestream option. It is responsible for allocating the data structures - * needed to track the new file system's file streams. - */ -int -xfs_filestream_mount( - xfs_mount_t *mp) -{ - int err; - unsigned int lifetime, grp_count; - - /* - * The filestream timer tunable is currently fixed within the range of - * one second to four minutes, with five seconds being the default. The - * group count is somewhat arbitrary, but it'd be nice to adhere to the - * timer tunable to within about 10 percent. This requires at least 10 - * groups. - */ - lifetime = xfs_fstrm_centisecs * 10; - grp_count = 10; - - err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count, - xfs_fstrm_free_func); +out_free_item: + kmem_free(item); +out_put_ag: + xfs_filestream_put_ag(mp, *agp); return err; } -/* - * xfs_filestream_unmount() is called when a file system that was mounted with - * the filestream option is unmounted. It drains the data structures created - * to track the file system's file streams and frees all the memory that was - * allocated. - */ -void -xfs_filestream_unmount( - xfs_mount_t *mp) +static struct xfs_inode * +xfs_filestream_get_parent( + struct xfs_inode *ip) { - xfs_mru_cache_destroy(mp->m_filestream); -} + struct inode *inode = VFS_I(ip), *dir = NULL; + struct dentry *dentry, *parent; -/* - * Return the AG of the filestream the file or directory belongs to, or - * NULLAGNUMBER otherwise. - */ -xfs_agnumber_t -xfs_filestream_lookup_ag( - xfs_inode_t *ip) -{ - xfs_mru_cache_t *cache; - fstrm_item_t *item; - xfs_agnumber_t ag; - int ref; - - if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) { - ASSERT(0); - return NULLAGNUMBER; - } + dentry = d_find_alias(inode); + if (!dentry) + goto out; - cache = ip->i_mount->m_filestream; - item = xfs_mru_cache_lookup(cache, ip->i_ino); - if (!item) { - TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0); - return NULLAGNUMBER; - } + parent = dget_parent(dentry); + if (!parent) + goto out_dput; - ASSERT(ip == item->ip); - ag = item->ag; - ref = xfs_filestream_peek_ag(ip->i_mount, ag); - xfs_mru_cache_done(cache); + dir = igrab(parent->d_inode); + dput(parent); - TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref); - return ag; +out_dput: + dput(dentry); +out: + return dir ? XFS_I(dir) : NULL; } /* - * xfs_filestream_associate() should only be called to associate a regular file - * with its parent directory. Calling it with a child directory isn't - * appropriate because filestreams don't apply to entire directory hierarchies. - * Creating a file in a child directory of an existing filestream directory - * starts a new filestream with its own allocation group association. + * Find the right allocation group for a file, either by finding an + * existing file stream or creating a new one. * - * Returns < 0 on error, 0 if successful association occurred, > 0 if - * we failed to get an association because of locking issues. + * Returns NULLAGNUMBER in case of an error. */ -int -xfs_filestream_associate( - xfs_inode_t *pip, - xfs_inode_t *ip) +xfs_agnumber_t +xfs_filestream_lookup_ag( + struct xfs_inode *ip) { - xfs_mount_t *mp; - xfs_mru_cache_t *cache; - fstrm_item_t *item; - xfs_agnumber_t ag, rotorstep, startag; - int err = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_inode *pip = NULL; + xfs_agnumber_t startag, ag = NULLAGNUMBER; + struct xfs_mru_cache_elem *mru; - ASSERT(S_ISDIR(pip->i_d.di_mode)); ASSERT(S_ISREG(ip->i_d.di_mode)); - if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode)) - return -EINVAL; - - mp = pip->i_mount; - cache = mp->m_filestream; - - /* - * We have a problem, Houston. - * - * Taking the iolock here violates inode locking order - we already - * hold the ilock. Hence if we block getting this lock we may never - * wake. Unfortunately, that means if we can't get the lock, we're - * screwed in terms of getting a stream association - we can't spin - * waiting for the lock because someone else is waiting on the lock we - * hold and we cannot drop that as we are in a transaction here. - * - * Lucky for us, this inversion is not a problem because it's a - * directory inode that we are trying to lock here. - * - * So, if we can't get the iolock without sleeping then just give up - */ - if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) - return 1; - /* If the parent directory is already in the cache, use its AG. */ - item = xfs_mru_cache_lookup(cache, pip->i_ino); - if (item) { - ASSERT(item->ip == pip); - ag = item->ag; - xfs_mru_cache_done(cache); + pip = xfs_filestream_get_parent(ip); + if (!pip) + goto out; - TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag)); - err = _xfs_filestream_update_ag(ip, pip, ag); + mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); + if (mru) { + ag = container_of(mru, struct xfs_fstrm_item, mru)->ag; + xfs_mru_cache_done(mp->m_filestream); - goto exit; + trace_xfs_filestream_lookup(ip, ag); + goto out; } /* @@ -623,202 +341,94 @@ xfs_filestream_associate( * use the directory inode's AG. */ if (mp->m_flags & XFS_MOUNT_32BITINODES) { - rotorstep = xfs_rotorstep; + xfs_agnumber_t rotorstep = xfs_rotorstep; startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; mp->m_agfrotor = (mp->m_agfrotor + 1) % (mp->m_sb.sb_agcount * rotorstep); } else startag = XFS_INO_TO_AGNO(mp, pip->i_ino); - /* Pick a new AG for the parent inode starting at startag. */ - err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0); - if (err || ag == NULLAGNUMBER) - goto exit_did_pick; - - /* Associate the parent inode with the AG. */ - err = _xfs_filestream_update_ag(pip, NULL, ag); - if (err) - goto exit_did_pick; - - /* Associate the file inode with the AG. */ - err = _xfs_filestream_update_ag(ip, pip, ag); - if (err) - goto exit_did_pick; - - TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag)); - -exit_did_pick: - /* - * If _xfs_filestream_pick_ag() returned a valid AG, remove the - * reference it took on it, since the file and directory will have taken - * their own now if they were successfully cached. - */ - if (ag != NULLAGNUMBER) - xfs_filestream_put_ag(mp, ag); - -exit: - xfs_iunlock(pip, XFS_IOLOCK_EXCL); - return -err; + if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0)) + ag = NULLAGNUMBER; +out: + IRELE(pip); + return ag; } /* - * Pick a new allocation group for the current file and its file stream. This - * function is called by xfs_bmap_filestreams() with the mount point's per-ag - * lock held. + * Pick a new allocation group for the current file and its file stream. + * + * This is called when the allocator can't find a suitable extent in the + * current AG, and we have to move the stream into a new AG with more space. */ int xfs_filestream_new_ag( struct xfs_bmalloca *ap, xfs_agnumber_t *agp) { - int flags, err; - xfs_inode_t *ip, *pip = NULL; - xfs_mount_t *mp; - xfs_mru_cache_t *cache; - xfs_extlen_t minlen; - fstrm_item_t *dir, *file; - xfs_agnumber_t ag = NULLAGNUMBER; - - ip = ap->ip; - mp = ip->i_mount; - cache = mp->m_filestream; - minlen = ap->length; - *agp = NULLAGNUMBER; + struct xfs_inode *ip = ap->ip, *pip; + struct xfs_mount *mp = ip->i_mount; + xfs_extlen_t minlen = ap->length; + xfs_agnumber_t startag = 0; + int flags, err = 0; + struct xfs_mru_cache_elem *mru; - /* - * Look for the file in the cache, removing it if it's found. Doing - * this allows it to be held across the dir lookup that follows. - */ - file = xfs_mru_cache_remove(cache, ip->i_ino); - if (file) { - ASSERT(ip == file->ip); - - /* Save the file's parent inode and old AG number for later. */ - pip = file->pip; - ag = file->ag; - - /* Look for the file's directory in the cache. */ - dir = xfs_mru_cache_lookup(cache, pip->i_ino); - if (dir) { - ASSERT(pip == dir->ip); - - /* - * If the directory has already moved on to a new AG, - * use that AG as the new AG for the file. Don't - * forget to twiddle the AG refcounts to match the - * movement. - */ - if (dir->ag != file->ag) { - xfs_filestream_put_ag(mp, file->ag); - xfs_filestream_get_ag(mp, dir->ag); - *agp = file->ag = dir->ag; - } - - xfs_mru_cache_done(cache); - } + *agp = NULLAGNUMBER; - /* - * Put the file back in the cache. If this fails, the free - * function needs to be called to tidy up in the same way as if - * the item had simply expired from the cache. - */ - err = xfs_mru_cache_insert(cache, ip->i_ino, file); - if (err) { - xfs_fstrm_free_func(ip->i_ino, file); - return err; - } + pip = xfs_filestream_get_parent(ip); + if (!pip) + goto exit; - /* - * If the file's AG was moved to the directory's new AG, there's - * nothing more to be done. - */ - if (*agp != NULLAGNUMBER) { - TRACE_MOVEAG(mp, ip, pip, - ag, xfs_filestream_peek_ag(mp, ag), - *agp, xfs_filestream_peek_ag(mp, *agp)); - return 0; - } + mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino); + if (mru) { + struct xfs_fstrm_item *item = + container_of(mru, struct xfs_fstrm_item, mru); + startag = (item->ag + 1) % mp->m_sb.sb_agcount; } - /* - * If the file's parent directory is known, take its iolock in exclusive - * mode to prevent two sibling files from racing each other to migrate - * themselves and their parent to different AGs. - * - * Note that we lock the parent directory iolock inside the child - * iolock here. That's fine as we never hold both parent and child - * iolock in any other place. This is different from the ilock, - * which requires locking of the child after the parent for namespace - * operations. - */ - if (pip) - xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - - /* - * A new AG needs to be found for the file. If the file's parent - * directory is also known, it will be moved to the new AG as well to - * ensure that files created inside it in future use the new AG. - */ - ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); - err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); - if (err || *agp == NULLAGNUMBER) - goto exit; + err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); /* - * If the file wasn't found in the file cache, then its parent directory - * inode isn't known. For this to have happened, the file must either - * be pre-existing, or it was created long enough ago that its cache - * entry has expired. This isn't the sort of usage that the filestreams - * allocator is trying to optimise, so there's no point trying to track - * its new AG somehow in the filestream data structures. + * Only free the item here so we skip over the old AG earlier. */ - if (!pip) { - TRACE_ORPHAN(mp, ip, *agp); - goto exit; - } - - /* Associate the parent inode with the AG. */ - err = _xfs_filestream_update_ag(pip, NULL, *agp); - if (err) - goto exit; - - /* Associate the file inode with the AG. */ - err = _xfs_filestream_update_ag(ip, pip, *agp); - if (err) - goto exit; - - TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0, - *agp, xfs_filestream_peek_ag(mp, *agp)); + if (mru) + xfs_fstrm_free_func(mru); + IRELE(pip); exit: - /* - * If _xfs_filestream_pick_ag() returned a valid AG, remove the - * reference it took on it, since the file and directory will have taken - * their own now if they were successfully cached. - */ - if (*agp != NULLAGNUMBER) - xfs_filestream_put_ag(mp, *agp); - else + if (*agp == NULLAGNUMBER) *agp = 0; - - if (pip) - xfs_iunlock(pip, XFS_IOLOCK_EXCL); - return err; } -/* - * Remove an association between an inode and a filestream object. - * Typically this is done on last close of an unlinked file. - */ void xfs_filestream_deassociate( - xfs_inode_t *ip) + struct xfs_inode *ip) { - xfs_mru_cache_t *cache = ip->i_mount->m_filestream; + xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino); +} - xfs_mru_cache_delete(cache, ip->i_ino); +int +xfs_filestream_mount( + xfs_mount_t *mp) +{ + /* + * The filestream timer tunable is currently fixed within the range of + * one second to four minutes, with five seconds being the default. The + * group count is somewhat arbitrary, but it'd be nice to adhere to the + * timer tunable to within about 10 percent. This requires at least 10 + * groups. + */ + return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10, + 10, xfs_fstrm_free_func); +} + +void +xfs_filestream_unmount( + xfs_mount_t *mp) +{ + xfs_mru_cache_destroy(mp->m_filestream); } diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 6d61dbee856..2ef43406e53 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -20,50 +20,20 @@ struct xfs_mount; struct xfs_inode; -struct xfs_perag; struct xfs_bmalloca; -#ifdef XFS_FILESTREAMS_TRACE -#define XFS_FSTRM_KTRACE_INFO 1 -#define XFS_FSTRM_KTRACE_AGSCAN 2 -#define XFS_FSTRM_KTRACE_AGPICK1 3 -#define XFS_FSTRM_KTRACE_AGPICK2 4 -#define XFS_FSTRM_KTRACE_UPDATE 5 -#define XFS_FSTRM_KTRACE_FREE 6 -#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7 -#define XFS_FSTRM_KTRACE_ASSOCIATE 8 -#define XFS_FSTRM_KTRACE_MOVEAG 9 -#define XFS_FSTRM_KTRACE_ORPHAN 10 - -#define XFS_FSTRM_KTRACE_SIZE 16384 -extern ktrace_t *xfs_filestreams_trace_buf; - -#endif - -/* allocation selection flags */ -typedef enum xfs_fstrm_alloc { - XFS_PICK_USERDATA = 1, - XFS_PICK_LOWSPACE = 2, -} xfs_fstrm_alloc_t; - -/* prototypes for filestream.c */ -int xfs_filestream_init(void); -void xfs_filestream_uninit(void); int xfs_filestream_mount(struct xfs_mount *mp); void xfs_filestream_unmount(struct xfs_mount *mp); -xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); -int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); void xfs_filestream_deassociate(struct xfs_inode *ip); +xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); +int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno); - -/* filestreams for the inode? */ static inline int xfs_inode_is_filestream( struct xfs_inode *ip) { return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || - xfs_iflags_test(ip, XFS_IFILESTREAM) || (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); } diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h index 35c08ff54ca..34d85aca305 100644 --- a/fs/xfs/xfs_format.h +++ b/fs/xfs/xfs_format.h @@ -145,6 +145,8 @@ struct xfs_dsymlink_hdr { __be64 sl_lsn; }; +#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc) + /* * The maximum pathlen is 1024 bytes. Since the minimum file system * blocksize is 512 bytes, we can get a max of 3 extents back from @@ -156,14 +158,271 @@ struct xfs_dsymlink_hdr { ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ sizeof(struct xfs_dsymlink_hdr) : 0)) -int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); -int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, - uint32_t size, struct xfs_buf *bp); -bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, - uint32_t size, struct xfs_buf *bp); -void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_inode *ip, struct xfs_ifork *ifp); -extern const struct xfs_buf_ops xfs_symlink_buf_ops; +/* + * Allocation Btree format definitions + * + * There are two on-disk btrees, one sorted by blockno and one sorted + * by blockcount and blockno. All blocks look the same to make the code + * simpler; if we have time later, we'll make the optimizations. + */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */ + +/* + * Data record/key structure + */ +typedef struct xfs_alloc_rec { + __be32 ar_startblock; /* starting block number */ + __be32 ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_t, xfs_alloc_key_t; + +typedef struct xfs_alloc_rec_incore { + xfs_agblock_t ar_startblock; /* starting block number */ + xfs_extlen_t ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_incore_t; + +/* btree pointer type */ +typedef __be32 xfs_alloc_ptr_t; + +/* + * Block numbers in the AG: + * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. + */ +#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) +#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) + + +/* + * Inode Allocation Btree format definitions + * + * There is a btree for the inode map per allocation group. + */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ +#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */ +#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */ + +typedef __uint64_t xfs_inofree_t; +#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) +#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) +#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) +#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) + +static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) +{ + return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; +} + +/* + * Data record structure + */ +typedef struct xfs_inobt_rec { + __be32 ir_startino; /* starting inode number */ + __be32 ir_freecount; /* count of free inodes (set bits) */ + __be64 ir_free; /* free inode mask */ +} xfs_inobt_rec_t; + +typedef struct xfs_inobt_rec_incore { + xfs_agino_t ir_startino; /* starting inode number */ + __int32_t ir_freecount; /* count of free inodes (set bits) */ + xfs_inofree_t ir_free; /* free inode mask */ +} xfs_inobt_rec_incore_t; + + +/* + * Key structure + */ +typedef struct xfs_inobt_key { + __be32 ir_startino; /* starting inode number */ +} xfs_inobt_key_t; + +/* btree pointer type */ +typedef __be32 xfs_inobt_ptr_t; + +/* + * block numbers in the AG. + */ +#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) +#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) + +/* + * The first data block of an AG depends on whether the filesystem was formatted + * with the finobt feature. If so, account for the finobt reserved root btree + * block. + */ +#define XFS_PREALLOC_BLOCKS(mp) \ + (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ + XFS_FIBT_BLOCK(mp) + 1 : \ + XFS_IBT_BLOCK(mp) + 1) + + + +/* + * BMAP Btree format definitions + * + * This includes both the root block definition that sits inside an inode fork + * and the record/pointer formats for the leaf/node in the blocks. + */ +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */ + +/* + * Bmap root header, on-disk form only. + */ +typedef struct xfs_bmdr_block { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +} xfs_bmdr_block_t; + +/* + * Bmap btree record and extent descriptor. + * l0:63 is an extent flag (value 1 indicates non-normal). + * l0:9-62 are startoff. + * l0:0-8 and l1:21-63 are startblock. + * l1:0-20 are blockcount. + */ +#define BMBT_EXNTFLAG_BITLEN 1 +#define BMBT_STARTOFF_BITLEN 54 +#define BMBT_STARTBLOCK_BITLEN 52 +#define BMBT_BLOCKCOUNT_BITLEN 21 + +typedef struct xfs_bmbt_rec { + __be64 l0, l1; +} xfs_bmbt_rec_t; + +typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ +typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; + +typedef struct xfs_bmbt_rec_host { + __uint64_t l0, l1; +} xfs_bmbt_rec_host_t; + +/* + * Values and macros for delayed-allocation startblock fields. + */ +#define STARTBLOCKVALBITS 17 +#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) +#define DSTARTBLOCKMASKBITS (15 + 20) +#define STARTBLOCKMASK \ + (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) +#define DSTARTBLOCKMASK \ + (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) + +static inline int isnullstartblock(xfs_fsblock_t x) +{ + return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; +} + +static inline int isnulldstartblock(xfs_dfsbno_t x) +{ + return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; +} + +static inline xfs_fsblock_t nullstartblock(int k) +{ + ASSERT(k < (1 << STARTBLOCKVALBITS)); + return STARTBLOCKMASK | (k); +} + +static inline xfs_filblks_t startblockval(xfs_fsblock_t x) +{ + return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); +} + +/* + * Possible extent formats. + */ +typedef enum { + XFS_EXTFMT_NOSTATE = 0, + XFS_EXTFMT_HASSTATE +} xfs_exntfmt_t; + +/* + * Possible extent states. + */ +typedef enum { + XFS_EXT_NORM, XFS_EXT_UNWRITTEN, + XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID +} xfs_exntst_t; + +/* + * Incore version of above. + */ +typedef struct xfs_bmbt_irec +{ + xfs_fileoff_t br_startoff; /* starting file offset */ + xfs_fsblock_t br_startblock; /* starting block number */ + xfs_filblks_t br_blockcount; /* number of blocks */ + xfs_exntst_t br_state; /* extent state */ +} xfs_bmbt_irec_t; + +/* + * Key structure for non-leaf levels of the tree. + */ +typedef struct xfs_bmbt_key { + __be64 br_startoff; /* starting file offset */ +} xfs_bmbt_key_t, xfs_bmdr_key_t; + +/* btree pointer type */ +typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; + + +/* + * Generic Btree block format definitions + * + * This is a combination of the actual format used on disk for short and long + * format btrees. The first three fields are shared by both format, but the + * pointers are different and should be used with care. + * + * To get the size of the actual short or long form headers please use the size + * macros below. Never use sizeof(xfs_btree_block). + * + * The blkno, crc, lsn, owner and uuid fields are only available in filesystems + * with the crc feature bit, and all accesses to them must be conditional on + * that flag. + */ +struct xfs_btree_block { + __be32 bb_magic; /* magic number for block type */ + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ + union { + struct { + __be32 bb_leftsib; + __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; + } s; /* short form pointers */ + struct { + __be64 bb_leftsib; + __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ + } l; /* long form pointers */ + } bb_u; /* rest */ +}; + +#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ +#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ + +/* sizes of CRC enabled btree blocks */ +#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) +#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) + +#define XFS_BTREE_SBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.s.bb_crc) +#define XFS_BTREE_LBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.l.bb_crc) #endif /* __XFS_FORMAT_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 18272c766a5..d34703dbcb4 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -233,11 +233,12 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ -#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ #define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ #define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ - +#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ +#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ /* * Minimum and maximum sizes need for growth checks. diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index e64ee5288b8..d2295561570 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -17,28 +17,31 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" -#include "xfs_btree.h" #include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_fsops.h" #include "xfs_itable.h" #include "xfs_trans_space.h" #include "xfs_rtalloc.h" -#include "xfs_filestream.h" #include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_dinode.h" +#include "xfs_filestream.h" /* * File system operations @@ -73,23 +76,18 @@ xfs_fs_geometry( } if (new_version >= 3) { geo->version = XFS_FSOP_GEOM_VERSION; - geo->flags = + geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | + XFS_FSOP_GEOM_FLAGS_DIRV2 | (xfs_sb_version_hasattr(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR : 0) | - (xfs_sb_version_hasnlink(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_NLINK : 0) | (xfs_sb_version_hasquota(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_QUOTA : 0) | (xfs_sb_version_hasalign(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_IALIGN : 0) | (xfs_sb_version_hasdalign(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_DALIGN : 0) | - (xfs_sb_version_hasshared(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_SHARED : 0) | (xfs_sb_version_hasextflgbit(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) | - (xfs_sb_version_hasdirv2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | (xfs_sb_version_hassector(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | (xfs_sb_version_hasasciici(&mp->m_sb) ? @@ -101,11 +99,15 @@ xfs_fs_geometry( (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | (xfs_sb_version_hascrc(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_V5SB : 0); + XFS_FSOP_GEOM_FLAGS_V5SB : 0) | + (xfs_sb_version_hasftype(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | + (xfs_sb_version_hasfinobt(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_FINOBT : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; - geo->dirblocksize = mp->m_dirblksize; + geo->dirblocksize = mp->m_dir_geo->blksize; } if (new_version >= 4) { geo->flags |= @@ -153,7 +155,7 @@ xfs_growfs_data_private( xfs_buf_t *bp; int bucket; int dpct; - int error; + int error, saved_error = 0; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; xfs_rfsblock_t nb, nb_mod; @@ -217,6 +219,8 @@ xfs_growfs_data_private( */ nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + __be32 *agfl_bno; + /* * AG freespace header block */ @@ -276,8 +280,10 @@ xfs_growfs_data_private( agfl->agfl_seqno = cpu_to_be32(agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) - agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -309,6 +315,10 @@ xfs_growfs_data_private( agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_sb_version_hascrc(&mp->m_sb)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); + agi->agi_free_level = cpu_to_be32(1); + } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); @@ -400,6 +410,34 @@ xfs_growfs_data_private( xfs_buf_relse(bp); if (error) goto error0; + + /* + * FINO btree root block + */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); + if (!bp) { + error = ENOMEM; + goto error0; + } + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC, + 0, 0, agno, + XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0, + 0, agno, 0); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + } + } xfs_trans_agblocks_delta(tp, nfree); /* @@ -496,29 +534,33 @@ xfs_growfs_data_private( error = ENOMEM; } + /* + * If we get an error reading or writing alternate superblocks, + * continue. xfs_repair chooses the "best" superblock based + * on most matches; if we break early, we'll leave more + * superblocks un-updated than updated, and xfs_repair may + * pick them over the properly-updated primary. + */ if (error) { xfs_warn(mp, "error %d reading secondary superblock for ag %d", error, agno); - break; + saved_error = error; + continue; } xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); - /* - * If we get an error writing out the alternate superblocks, - * just issue a warning and continue. The real work is - * already done and committed. - */ error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) { xfs_warn(mp, "write error %d updating secondary superblock for ag %d", error, agno); - break; /* no point in continuing */ + saved_error = error; + continue; } } - return error; + return saved_error ? saved_error : error; error0: xfs_trans_cancel(tp, XFS_TRANS_ABORT); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index ccf2fb14396..5960e5593fe 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -17,29 +17,30 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_cksum.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_icreate_item.h" #include "xfs_icache.h" +#include "xfs_dinode.h" +#include "xfs_trace.h" /* @@ -51,7 +52,7 @@ xfs_ialloc_cluster_alignment( { if (xfs_sb_version_hasalign(&args->mp->m_sb) && args->mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) + XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size)) return args->mp->m_sb.sb_inoalignmt; return 1; } @@ -111,6 +112,66 @@ xfs_inobt_get_rec( } /* + * Insert a single inobt record. Cursor must already point to desired location. + */ +STATIC int +xfs_inobt_insert_rec( + struct xfs_btree_cur *cur, + __int32_t freecount, + xfs_inofree_t free, + int *stat) +{ + cur->bc_rec.i.ir_freecount = freecount; + cur->bc_rec.i.ir_free = free; + return xfs_btree_insert(cur, stat); +} + +/* + * Insert records describing a newly allocated inode chunk into the inobt. + */ +STATIC int +xfs_inobt_insert( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t newino, + xfs_agino_t newlen, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agino_t thisino; + int i; + int error; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + for (thisino = newino; + thisino < newino + newlen; + thisino += XFS_INODES_PER_CHUNK) { + error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 0); + + error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, + XFS_INOBT_ALL_FREE, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 1); + } + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + + return 0; +} + +/* * Verify that the number of free inodes in the AGI is correct. */ #ifdef DEBUG @@ -169,27 +230,20 @@ xfs_ialloc_inode_init( { struct xfs_buf *fbuf; struct xfs_dinode *free; - int blks_per_cluster, nbufs, ninodes; + int nbufs, blks_per_cluster, inodes_per_cluster; int version; int i, j; xfs_daddr_t d; xfs_ino_t ino = 0; /* - * Loop over the new block(s), filling in the inodes. - * For small block sizes, manipulate the inodes in buffers - * which are multiples of the blocks size. + * Loop over the new block(s), filling in the inodes. For small block + * sizes, manipulate the inodes in buffers which are multiples of the + * blocks size. */ - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { - blks_per_cluster = 1; - nbufs = length; - ninodes = mp->m_sb.sb_inopblock; - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / - mp->m_sb.sb_blocksize; - nbufs = length / blks_per_cluster; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - } + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = length / blks_per_cluster; /* * Figure out what version number to use in the inodes we create. If @@ -224,12 +278,10 @@ xfs_ialloc_inode_init( * they track in the AIL as if they were physically logged. */ if (tp) - xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp), + xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, mp->m_sb.sb_inodesize, length, gen); - } else if (xfs_sb_version_hasnlink(&mp->m_sb)) + } else version = 2; - else - version = 1; for (j = 0; j < nbufs; j++) { /* @@ -245,7 +297,7 @@ xfs_ialloc_inode_init( /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); - for (i = 0; i < ninodes; i++) { + for (i = 0; i < inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; uint isize = xfs_dinode_size(version); @@ -309,13 +361,10 @@ xfs_ialloc_ag_alloc( { xfs_agi_t *agi; /* allocation group header */ xfs_alloc_arg_t args; /* allocation argument structure */ - xfs_btree_cur_t *cur; /* inode btree cursor */ xfs_agnumber_t agno; int error; - int i; xfs_agino_t newino; /* new first inode's number */ xfs_agino_t newlen; /* new number of inodes */ - xfs_agino_t thisino; /* current inode number, for loop */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ struct xfs_perag *pag; @@ -328,11 +377,11 @@ xfs_ialloc_ag_alloc( * Locking will ensure that we don't have two callers in here * at one time. */ - newlen = XFS_IALLOC_INODES(args.mp); + newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) return XFS_ERROR(ENOSPC); - args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); + args.minlen = args.maxlen = args.mp->m_ialloc_blks; /* * First try to allocate inodes contiguous with the last-allocated * chunk of inodes. If the filesystem is striped, this will fill @@ -342,7 +391,7 @@ xfs_ialloc_ag_alloc( newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.mp->m_ialloc_blks; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); @@ -369,6 +418,18 @@ xfs_ialloc_ag_alloc( args.minleft = args.mp->m_in_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; + + /* + * This request might have dirtied the transaction if the AG can + * satisfy the request, but the exact block was not available. + * If the allocation did fail, subsequent requests will relax + * the exact agbno requirement and increase the alignment + * instead. It is critical that the total size of the request + * (len + alignment + slop) does not increase from this point + * on, so reset minalignslop to ensure it is not included in + * subsequent requests. + */ + args.minalignslop = 0; } else args.fsbno = NULLFSBLOCK; @@ -453,29 +514,19 @@ xfs_ialloc_ag_alloc( agi->agi_newino = cpu_to_be32(newino); /* - * Insert records describing the new inode chunk into the btree. + * Insert records describing the new inode chunk into the btrees. */ - cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno); - for (thisino = newino; - thisino < newino + newlen; - thisino += XFS_INODES_PER_CHUNK) { - cur->bc_rec.i.ir_startino = thisino; - cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK; - cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE; - error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i); - if (error) { - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - return error; - } - ASSERT(i == 0); - error = xfs_btree_insert(cur, &i); - if (error) { - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_FINO); + if (error) return error; - } - ASSERT(i == 1); } - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); /* * Log allocation group header fields */ @@ -584,7 +635,7 @@ xfs_ialloc_ag_select( * Is there enough free space for the file plus a block of * inodes? (if we need to allocate some)? */ - ineed = XFS_IALLOC_BLOCKS(mp); + ineed = mp->m_ialloc_blks; longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; @@ -669,13 +720,10 @@ xfs_ialloc_get_rec( } /* - * Allocate an inode. - * - * The caller selected an AG for us, and made sure that free inodes are - * available. + * Allocate an inode using the inobt-only algorithm. */ STATIC int -xfs_dialloc_ag( +xfs_dialloc_ag_inobt( struct xfs_trans *tp, struct xfs_buf *agbp, xfs_ino_t parent, @@ -701,7 +749,7 @@ xfs_dialloc_ag( ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -934,6 +982,294 @@ error0: } /* + * Use the free inode btree to allocate an inode based on distance from the + * parent. Note that the provided cursor may be deleted and replaced. + */ +STATIC int +xfs_dialloc_ag_finobt_near( + xfs_agino_t pagino, + struct xfs_btree_cur **ocur, + struct xfs_inobt_rec_incore *rec) +{ + struct xfs_btree_cur *lcur = *ocur; /* left search cursor */ + struct xfs_btree_cur *rcur; /* right search cursor */ + struct xfs_inobt_rec_incore rrec; + int error; + int i, j; + + error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i); + if (error) + return error; + + if (i == 1) { + error = xfs_inobt_get_rec(lcur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + /* + * See if we've landed in the parent inode record. The finobt + * only tracks chunks with at least one free inode, so record + * existence is enough. + */ + if (pagino >= rec->ir_startino && + pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK)) + return 0; + } + + error = xfs_btree_dup_cursor(lcur, &rcur); + if (error) + return error; + + error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j); + if (error) + goto error_rcur; + if (j == 1) { + error = xfs_inobt_get_rec(rcur, &rrec, &j); + if (error) + goto error_rcur; + XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); + } + + XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); + if (i == 1 && j == 1) { + /* + * Both the left and right records are valid. Choose the closer + * inode chunk to the target. + */ + if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) > + (rrec.ir_startino - pagino)) { + *rec = rrec; + xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); + *ocur = rcur; + } else { + xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); + } + } else if (j == 1) { + /* only the right record is valid */ + *rec = rrec; + xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); + *ocur = rcur; + } else if (i == 1) { + /* only the left record is valid */ + xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); + } + + return 0; + +error_rcur: + xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Use the free inode btree to find a free inode based on a newino hint. If + * the hint is NULL, find the first free inode in the AG. + */ +STATIC int +xfs_dialloc_ag_finobt_newino( + struct xfs_agi *agi, + struct xfs_btree_cur *cur, + struct xfs_inobt_rec_incore *rec) +{ + int error; + int i; + + if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { + error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ, + &i); + if (error) + return error; + if (i == 1) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + return 0; + } + } + + /* + * Find the first inode available in the AG. + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + return 0; +} + +/* + * Update the inobt based on a modification made to the finobt. Also ensure that + * the records from both trees are equivalent post-modification. + */ +STATIC int +xfs_dialloc_ag_update_inobt( + struct xfs_btree_cur *cur, /* inobt cursor */ + struct xfs_inobt_rec_incore *frec, /* finobt record */ + int offset) /* inode offset */ +{ + struct xfs_inobt_rec_incore rec; + int error; + int i; + + error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + + XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && + (rec.ir_freecount == frec->ir_freecount)); + + error = xfs_inobt_update(cur, &rec); + if (error) + return error; + + return 0; +} + +/* + * Allocate an inode using the free inode btree, if available. Otherwise, fall + * back to the inobt search algorithm. + * + * The caller selected an AG for us, and made sure that free inodes are + * available. + */ +STATIC int +xfs_dialloc_ag( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_ino_t parent, + xfs_ino_t *inop) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); + xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); + struct xfs_perag *pag; + struct xfs_btree_cur *cur; /* finobt cursor */ + struct xfs_btree_cur *icur; /* inobt cursor */ + struct xfs_inobt_rec_incore rec; + xfs_ino_t ino; + int error; + int offset; + int i; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); + + pag = xfs_perag_get(mp, agno); + + /* + * If pagino is 0 (this is the root inode allocation) use newino. + * This must work because we've just allocated some. + */ + if (!pagino) + pagino = be32_to_cpu(agi->agi_newino); + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error_cur; + + /* + * The search algorithm depends on whether we're in the same AG as the + * parent. If so, find the closest available inode to the parent. If + * not, consider the agi hint or find the first free inode in the AG. + */ + if (agno == pagno) + error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); + else + error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); + if (error) + goto error_cur; + + offset = xfs_lowbit64(rec.ir_free); + ASSERT(offset >= 0); + ASSERT(offset < XFS_INODES_PER_CHUNK); + ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + + /* + * Modify or remove the finobt record. + */ + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + if (rec.ir_freecount) + error = xfs_inobt_update(cur, &rec); + else + error = xfs_btree_delete(cur, &i); + if (error) + goto error_cur; + + /* + * The finobt has now been updated appropriately. We haven't updated the + * agi and superblock yet, so we can create an inobt cursor and validate + * the original freecount. If all is well, make the equivalent update to + * the inobt using the finobt record and offset information. + */ + icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + + error = xfs_check_agi_freecount(icur, agi); + if (error) + goto error_icur; + + error = xfs_dialloc_ag_update_inobt(icur, &rec, offset); + if (error) + goto error_icur; + + /* + * Both trees have now been updated. We must update the perag and + * superblock before we can check the freecount for each btree. + */ + be32_add_cpu(&agi->agi_freecount, -1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + pag->pagi_freecount--; + + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + + error = xfs_check_agi_freecount(icur, agi); + if (error) + goto error_icur; + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error_icur; + + xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_perag_put(pag); + *inop = ino; + return 0; + +error_icur: + xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); +error_cur: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_perag_put(pag); + return error; +} + +/* * Allocate an inode on disk. * * Mode is used to tell whether the new inode will need space, and whether it @@ -998,7 +1334,7 @@ xfs_dialloc( * inode. */ if (mp->m_maxicount && - mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { + mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { noroom = 1; okalloc = 0; } @@ -1092,78 +1428,34 @@ out_error: return XFS_ERROR(error); } -/* - * Free disk inode. Carefully avoids touching the incore inode, all - * manipulations incore are the caller's responsibility. - * The on-disk inode is not changed by this operation, only the - * btree (free inode mask) is changed. - */ -int -xfs_difree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t inode, /* inode to be freed */ - xfs_bmap_free_t *flist, /* extents to free */ - int *delete, /* set if inode cluster was deleted */ - xfs_ino_t *first_ino) /* first inode in deleted cluster */ +STATIC int +xfs_difree_inobt( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t agino, + struct xfs_bmap_free *flist, + int *deleted, + xfs_ino_t *first_ino, + struct xfs_inobt_rec_incore *orec) { - /* REFERENCED */ - xfs_agblock_t agbno; /* block number containing inode */ - xfs_buf_t *agbp; /* buffer containing allocation group header */ - xfs_agino_t agino; /* inode number relative to allocation group */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_agi_t *agi; /* allocation group header */ - xfs_btree_cur_t *cur; /* inode btree cursor */ - int error; /* error return value */ - int i; /* result code */ - int ilen; /* inodes in an inode cluster */ - xfs_mount_t *mp; /* mount structure for filesystem */ - int off; /* offset of inode in inode chunk */ - xfs_inobt_rec_incore_t rec; /* btree record */ - struct xfs_perag *pag; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_perag *pag; + struct xfs_btree_cur *cur; + struct xfs_inobt_rec_incore rec; + int ilen; + int error; + int i; + int off; - mp = tp->t_mountp; - - /* - * Break up inode number into its components. - */ - agno = XFS_INO_TO_AGNO(mp, inode); - if (agno >= mp->m_sb.sb_agcount) { - xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", - __func__, agno, mp->m_sb.sb_agcount); - ASSERT(0); - return XFS_ERROR(EINVAL); - } - agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { - xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", - __func__, (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); - ASSERT(0); - return XFS_ERROR(EINVAL); - } - agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks) { - xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", - __func__, agbno, mp->m_sb.sb_agblocks); - ASSERT(0); - return XFS_ERROR(EINVAL); - } - /* - * Get the allocation group header. - */ - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - if (error) { - xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", - __func__, error); - return error; - } - agi = XFS_BUF_TO_AGI(agbp); ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - ASSERT(agbno < be32_to_cpu(agi->agi_length)); + ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length)); + /* * Initialize the cursor. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); error = xfs_check_agi_freecount(cur, agi); if (error) @@ -1201,9 +1493,9 @@ xfs_difree( * When an inode cluster is free, it becomes eligible for removal */ if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { + (rec.ir_freecount == mp->m_ialloc_inos)) { - *delete = 1; + *deleted = 1; *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); /* @@ -1211,7 +1503,7 @@ xfs_difree( * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. */ - ilen = XFS_IALLOC_INODES(mp); + ilen = mp->m_ialloc_inos; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); @@ -1227,11 +1519,11 @@ xfs_difree( goto error0; } - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, - agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), - XFS_IALLOC_BLOCKS(mp), flist, mp); + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), + mp->m_ialloc_blks, flist, mp); } else { - *delete = 0; + *deleted = 0; error = xfs_inobt_update(cur, &rec); if (error) { @@ -1255,6 +1547,7 @@ xfs_difree( if (error) goto error0; + *orec = rec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; @@ -1263,6 +1556,182 @@ error0: return error; } +/* + * Free an inode in the free inode btree. + */ +STATIC int +xfs_difree_finobt( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t agino, + struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_btree_cur *cur; + struct xfs_inobt_rec_incore rec; + int offset = agino - ibtrec->ir_startino; + int error; + int i; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + + error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + if (i == 0) { + /* + * If the record does not exist in the finobt, we must have just + * freed an inode in a previously fully allocated chunk. If not, + * something is out of sync. + */ + XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); + + error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, + ibtrec->ir_free, &i); + if (error) + goto error; + ASSERT(i == 1); + + goto out; + } + + /* + * Read and update the existing record. We could just copy the ibtrec + * across here, but that would defeat the purpose of having redundant + * metadata. By making the modifications independently, we can catch + * corruptions that we wouldn't see if we just copied from one record + * to another. + */ + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(i == 1, error); + + rec.ir_free |= XFS_INOBT_MASK(offset); + rec.ir_freecount++; + + XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && + (rec.ir_freecount == ibtrec->ir_freecount), + error); + + /* + * The content of inobt records should always match between the inobt + * and finobt. The lifecycle of records in the finobt is different from + * the inobt in that the finobt only tracks records with at least one + * free inode. Hence, if all of the inodes are free and we aren't + * keeping inode chunks permanently on disk, remove the record. + * Otherwise, update the record with the new information. + */ + if (rec.ir_freecount == mp->m_ialloc_inos && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + error = xfs_btree_delete(cur, &i); + if (error) + goto error; + ASSERT(i == 1); + } else { + error = xfs_inobt_update(cur, &rec); + if (error) + goto error; + } + +out: + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int +xfs_difree( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + struct xfs_bmap_free *flist, /* extents to free */ + int *deleted,/* set if inode cluster was deleted */ + xfs_ino_t *first_ino)/* first inode in deleted cluster */ +{ + /* REFERENCED */ + xfs_agblock_t agbno; /* block number containing inode */ + struct xfs_buf *agbp; /* buffer for allocation group header */ + xfs_agino_t agino; /* allocation group inode number */ + xfs_agnumber_t agno; /* allocation group number */ + int error; /* error return value */ + struct xfs_mount *mp; /* mount structure for filesystem */ + struct xfs_inobt_rec_incore rec;/* btree record */ + + mp = tp->t_mountp; + + /* + * Break up inode number into its components. + */ + agno = XFS_INO_TO_AGNO(mp, inode); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", + __func__, agno, mp->m_sb.sb_agcount); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agino = XFS_INO_TO_AGINO(mp, inode); + if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { + xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + __func__, (unsigned long long)inode, + (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", + __func__, agbno, mp->m_sb.sb_agblocks); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + /* + * Get the allocation group header. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", + __func__, error); + return error; + } + + /* + * Fix up the inode allocation btree. + */ + error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, + &rec); + if (error) + goto error0; + + /* + * Fix up the free inode btree. + */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + error = xfs_difree_finobt(mp, tp, agbp, agino, &rec); + if (error) + goto error0; + } + + return 0; + +error0: + return error; +} + STATIC int xfs_imap_lookup( struct xfs_mount *mp, @@ -1294,7 +1763,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -1310,7 +1779,7 @@ xfs_imap_lookup( /* check that the returned record contains the required inode */ if (rec.ir_startino > agino || - rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + rec.ir_startino + mp->m_ialloc_inos <= agino) return EINVAL; /* for untrusted inodes check it is allocated first */ @@ -1383,7 +1852,7 @@ xfs_imap( return XFS_ERROR(EINVAL); } - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; + blks_per_cluster = xfs_icluster_size_fsb(mp); /* * For bulkstat and handle lookups, we have an untrusted inode number @@ -1404,7 +1873,7 @@ xfs_imap( * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. */ - if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { + if (blks_per_cluster == 1) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); @@ -1482,7 +1951,16 @@ xfs_ialloc_compute_maxlevels( } /* - * Log specified fields for the ag hdr (inode section) + * Log specified fields for the ag hdr (inode section). The growth of the agi + * structure over time requires that we interpret the buffer as two logical + * regions delineated by the end of the unlinked list. This is due to the size + * of the hash table and its location in the middle of the agi. + * + * For example, a request to log a field before agi_unlinked and a field after + * agi_unlinked could cause us to log the entire hash table and use an excessive + * amount of log space. To avoid this behavior, log the region up through + * agi_unlinked in one call and the region after agi_unlinked through the end of + * the structure in another. */ void xfs_ialloc_log_agi( @@ -1505,6 +1983,8 @@ xfs_ialloc_log_agi( offsetof(xfs_agi_t, agi_newino), offsetof(xfs_agi_t, agi_dirino), offsetof(xfs_agi_t, agi_unlinked), + offsetof(xfs_agi_t, agi_free_root), + offsetof(xfs_agi_t, agi_free_level), sizeof(xfs_agi_t) }; #ifdef DEBUG @@ -1513,15 +1993,30 @@ xfs_ialloc_log_agi( agi = XFS_BUF_TO_AGI(bp); ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); + /* - * Compute byte offsets for the first and last fields. + * Compute byte offsets for the first and last fields in the first + * region and log the agi buffer. This only logs up through + * agi_unlinked. */ - xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last); + if (fields & XFS_AGI_ALL_BITS_R1) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } + /* - * Log the allocation group inode header buffer. + * Mask off the bits in the first region and calculate the first and + * last field offsets for any bits in the second region. */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); - xfs_trans_log_buf(tp, bp, first, last); + fields &= ~XFS_AGI_ALL_BITS_R1; + if (fields) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } } #ifdef DEBUG @@ -1574,18 +2069,17 @@ xfs_agi_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - int agi_ok = 1; - - if (xfs_sb_version_hascrc(&mp->m_sb)) - agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agi, agi_crc)); - agi_ok = agi_ok && xfs_agi_verify(bp); - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, + XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -1596,8 +2090,8 @@ xfs_agi_write_verify( struct xfs_buf_log_item *bip = bp->b_fspriv; if (!xfs_agi_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -1606,8 +2100,7 @@ xfs_agi_write_verify( if (bip) XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agi, agi_crc)); + xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); } const struct xfs_buf_ops xfs_agi_buf_ops = { @@ -1627,15 +2120,15 @@ xfs_read_agi( { int error; - ASSERT(agno != NULLAGNUMBER); + trace_xfs_read_agi(mp, agno); + ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; - ASSERT(!xfs_buf_geterror(*bpp)); xfs_buf_set_ref(*bpp, XFS_AGI_REF); return 0; } @@ -1651,6 +2144,8 @@ xfs_ialloc_read_agi( struct xfs_perag *pag; /* per allocation group data */ int error; + trace_xfs_ialloc_read_agi(mp, agno); + error = xfs_read_agi(mp, tp, agno, bpp); if (error) return error; diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 68c07320f09..95ad1c002d6 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -23,18 +23,20 @@ struct xfs_dinode; struct xfs_imap; struct xfs_mount; struct xfs_trans; +struct xfs_btree_cur; -/* - * Allocation parameters for inode allocation. - */ -#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos -#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks - -/* - * Move inodes in clusters of this size. - */ +/* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 -#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size + +/* Calculate and return the number of filesystem blocks per inode cluster */ +static inline int +xfs_icluster_size_fsb( + struct xfs_mount *mp) +{ + if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) + return 1; + return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog; +} /* * Make an inode pointer out of the buffer/offset. @@ -42,7 +44,7 @@ struct xfs_trans; static inline struct xfs_dinode * xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) { - return (xfs_dinode_t *) + return (struct xfs_dinode *) (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); } @@ -88,7 +90,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *delete, /* set if inode cluster was deleted */ + int *deleted, /* set if inode cluster was deleted */ xfs_ino_t *first_ino); /* first inode in deleted cluster */ /* @@ -158,6 +160,4 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); -extern const struct xfs_buf_ops xfs_agi_buf_ops; - #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 5448eb6b8c1..726f83a681a 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -17,24 +17,23 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_trans.h" STATIC int @@ -50,7 +49,8 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); } STATIC void @@ -67,12 +67,26 @@ xfs_inobt_set_root( xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); } +STATIC void +xfs_finobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + + agi->agi_free_root = nptr->s; + be32_add_cpu(&agi->agi_free_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, + XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); +} + STATIC int xfs_inobt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int length, int *stat) { xfs_alloc_arg_t args; /* block allocation args */ @@ -174,6 +188,17 @@ xfs_inobt_init_ptr_from_cur( ptr->s = agi->agi_root; } +STATIC void +xfs_finobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ptr->s = agi->agi_free_root; +} + STATIC __int64_t xfs_inobt_key_diff( struct xfs_btree_cur *cur, @@ -204,6 +229,7 @@ xfs_inobt_verify( */ switch (block->bb_magic) { case cpu_to_be32(XFS_IBT_CRC_MAGIC): + case cpu_to_be32(XFS_FIBT_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) @@ -215,6 +241,7 @@ xfs_inobt_verify( return false; /* fall through */ case cpu_to_be32(XFS_IBT_MAGIC): + case cpu_to_be32(XFS_FIBT_MAGIC): break; default: return 0; @@ -244,12 +271,14 @@ static void xfs_inobt_read_verify( struct xfs_buf *bp) { - if (!(xfs_btree_sblock_verify_crc(bp) && - xfs_inobt_verify(bp))) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_inobt_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); } } @@ -259,9 +288,9 @@ xfs_inobt_write_verify( { if (!xfs_inobt_verify(bp)) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; } xfs_btree_sblock_calc_crc(bp); @@ -316,6 +345,28 @@ static const struct xfs_btree_ops xfs_inobt_ops = { #endif }; +static const struct xfs_btree_ops xfs_finobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_finobt_set_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, + .buf_ops = &xfs_inobt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, +#endif +}; + /* * Allocate a new inode btree cursor. */ @@ -324,7 +375,8 @@ xfs_inobt_init_cursor( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer for agi structure */ - xfs_agnumber_t agno) /* allocation group number */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* ialloc or free ino btree */ { struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; @@ -333,11 +385,17 @@ xfs_inobt_init_cursor( cur->bc_tp = tp; cur->bc_mp = mp; - cur->bc_nlevels = be32_to_cpu(agi->agi_level); - cur->bc_btnum = XFS_BTNUM_INO; + cur->bc_btnum = btnum; + if (btnum == XFS_BTNUM_INO) { + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + cur->bc_ops = &xfs_inobt_ops; + } else { + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur->bc_ops = &xfs_finobt_ops; + } + cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_inobt_ops; if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 3ac36b7642e..d7ebea72c2d 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -27,55 +27,6 @@ struct xfs_btree_cur; struct xfs_mount; /* - * There is a btree for the inode map per allocation group. - */ -#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ -#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ - -typedef __uint64_t xfs_inofree_t; -#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) -#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) -#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) -#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) - -static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) -{ - return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; -} - -/* - * Data record structure - */ -typedef struct xfs_inobt_rec { - __be32 ir_startino; /* starting inode number */ - __be32 ir_freecount; /* count of free inodes (set bits) */ - __be64 ir_free; /* free inode mask */ -} xfs_inobt_rec_t; - -typedef struct xfs_inobt_rec_incore { - xfs_agino_t ir_startino; /* starting inode number */ - __int32_t ir_freecount; /* count of free inodes (set bits) */ - xfs_inofree_t ir_free; /* free inode mask */ -} xfs_inobt_rec_incore_t; - - -/* - * Key structure - */ -typedef struct xfs_inobt_key { - __be32 ir_startino; /* starting inode number */ -} xfs_inobt_key_t; - -/* btree pointer type */ -typedef __be32 xfs_inobt_ptr_t; - -/* - * block numbers in the AG. - */ -#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) -#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) - -/* * Btree block header size depends on a superblock flag. */ #define XFS_INOBT_BLOCK_LEN(mp) \ @@ -107,9 +58,8 @@ typedef __be32 xfs_inobt_ptr_t; ((index) - 1) * sizeof(xfs_inobt_ptr_t))) extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, - struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); + struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, + xfs_btnum_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); -extern const struct xfs_buf_ops xfs_inobt_buf_ops; - #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 474807a401c..c48df5f25b9 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -18,24 +18,19 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_log_priv.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_dinode.h" #include "xfs_error.h" -#include "xfs_filestream.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_inode_item.h" #include "xfs_quota.h" #include "xfs_trace.h" -#include "xfs_fsops.h" #include "xfs_icache.h" #include "xfs_bmap_util.h" @@ -500,11 +495,6 @@ xfs_inode_ag_walk_grab( if (!igrab(inode)) return ENOENT; - if (is_bad_inode(inode)) { - IRELE(ip); - return ENOENT; - } - /* inode is valid */ return 0; @@ -517,8 +507,7 @@ STATIC int xfs_inode_ag_walk( struct xfs_mount *mp, struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags, + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, @@ -592,7 +581,7 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = execute(batch[i], pag, flags, args); + error = execute(batch[i], flags, args); IRELE(batch[i]); if (error == EAGAIN) { skipped++; @@ -646,8 +635,7 @@ xfs_eofblocks_worker( int xfs_inode_ag_iterator( struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags, + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args) @@ -674,8 +662,7 @@ xfs_inode_ag_iterator( int xfs_inode_ag_iterator_tag( struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags, + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, @@ -918,8 +905,6 @@ restart: xfs_iflock(ip); } - if (is_bad_inode(VFS_I(ip))) - goto reclaim; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip, false); @@ -1221,7 +1206,6 @@ xfs_inode_match_id( STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - struct xfs_perag *pag, int flags, void *args) { diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 9ed68bb750f..9cf017b899b 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -60,12 +60,10 @@ int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); void xfs_eofblocks_worker(struct work_struct *); int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, - int flags, void *args), + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, - int flags, void *args), + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, int tag); static inline int diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 5a5a593994d..7e454923325 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -17,16 +17,18 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_error.h" #include "xfs_icreate_item.h" +#include "xfs_log.h" kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ @@ -57,13 +59,14 @@ xfs_icreate_item_size( STATIC void xfs_icreate_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_icreate_item *icp = ICR_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; - log_vector->i_addr = (xfs_caddr_t)&icp->ic_format; - log_vector->i_len = sizeof(struct xfs_icreate_log); - log_vector->i_type = XLOG_REG_TYPE_ICREATE; + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE, + &icp->ic_format, + sizeof(struct xfs_icreate_log)); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e3d75385aa7..a6115fe1ac9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -19,29 +19,24 @@ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_log.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_space.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" #include "xfs_attr_sf.h" #include "xfs_attr.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" +#include "xfs_trans_space.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_inode_item.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -52,6 +47,9 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_symlink.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_bmap_btree.h" kmem_zone_t *xfs_inode_zone; @@ -63,6 +61,8 @@ kmem_zone_t *xfs_inode_zone; STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); +STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *); + /* * helper function to extract extent size hint from inode */ @@ -78,48 +78,44 @@ xfs_get_extsz_hint( } /* - * This is a wrapper routine around the xfs_ilock() routine used to centralize - * some grungy code. It is used in places that wish to lock the inode solely - * for reading the extents. The reason these places can't just call - * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode is in b-tree - * format, then we need to lock the inode exclusively until the extents are read - * in. Locking it exclusively all the time would limit our parallelism - * unnecessarily, though. What we do instead is check to see if the extents - * have been read in yet, and only lock the inode exclusively if they have not. + * These two are wrapper routines around the xfs_ilock() routine used to + * centralize some grungy code. They are used in places that wish to lock the + * inode solely for reading the extents. The reason these places can't just + * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to + * bringing in of the extents from disk for a file in b-tree format. If the + * inode is in b-tree format, then we need to lock the inode exclusively until + * the extents are read in. Locking it exclusively all the time would limit + * our parallelism unnecessarily, though. What we do instead is check to see + * if the extents have been read in yet, and only lock the inode exclusively + * if they have not. * - * The function returns a value which should be given to the corresponding - * xfs_iunlock_map_shared(). This value is the mode in which the lock was - * actually taken. + * The functions return a value which should be given to the corresponding + * xfs_iunlock() call. */ uint -xfs_ilock_map_shared( - xfs_inode_t *ip) +xfs_ilock_data_map_shared( + struct xfs_inode *ip) { - uint lock_mode; + uint lock_mode = XFS_ILOCK_SHARED; - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - xfs_ilock(ip, lock_mode); - return lock_mode; } -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). - * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) +uint +xfs_ilock_attr_map_shared( + struct xfs_inode *ip) { - xfs_iunlock(ip, lock_mode); + uint lock_mode = XFS_ILOCK_SHARED; + + if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; } /* @@ -589,9 +585,9 @@ xfs_lookup( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); - lock_mode = xfs_ilock_map_shared(dp); + lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); - xfs_iunlock_map_shared(dp, lock_mode); + xfs_iunlock(dp, lock_mode); if (error) goto out; @@ -659,7 +655,6 @@ xfs_ialloc( uint flags; int error; timespec_t tv; - int filestreams = 0; /* * Call the space management code to pick @@ -686,6 +681,14 @@ xfs_ialloc( return error; ASSERT(ip != NULL); + /* + * We always convert v1 inodes to v2 now - we only support filesystems + * with >= v2 inode capability, so there is no reason for ever leaving + * an inode in v1 format. + */ + if (ip->i_d.di_version == 1) + ip->i_d.di_version = 2; + ip->i_d.di_mode = mode; ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; @@ -695,27 +698,6 @@ xfs_ialloc( xfs_set_projid(ip, prid); memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - /* - * If the superblock version is up to where we support new format - * inodes and this is currently an old format inode, then change - * the inode version number now. This way we only do the conversion - * here rather than here and in the flush/logging code. - */ - if (xfs_sb_version_hasnlink(&mp->m_sb) && - ip->i_d.di_version == 1) { - ip->i_d.di_version = 2; - /* - * We've already zeroed the old link count, the projid field, - * and the pad field. - */ - } - - /* - * Project ids won't be stored on disk if we are using a version 1 inode. - */ - if ((prid != 0) && (ip->i_d.di_version == 1)) - xfs_bump_ino_vers2(tp, ip); - if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { @@ -776,13 +758,6 @@ xfs_ialloc( flags |= XFS_ILOG_DEV; break; case S_IFREG: - /* - * we can't set up filestreams until after the VFS inode - * is set up properly. - */ - if (pip && xfs_inode_is_filestream(pip)) - filestreams = 1; - /* fall through */ case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; @@ -848,15 +823,6 @@ xfs_ialloc( /* now that we have an i_mode we can setup inode ops and unlock */ xfs_setup_inode(ip); - /* now we have set up the vfs inode we can associate the filestream */ - if (filestreams) { - error = xfs_filestream_associate(pip, ip); - if (error < 0) - return -error; - if (!error) - xfs_iflags_set(ip, XFS_IFILESTREAM); - } - *ipp = ip; return 0; } @@ -1077,40 +1043,6 @@ xfs_droplink( } /* - * This gets called when the inode's version needs to be changed from 1 to 2. - * Currently this happens when the nlink field overflows the old 16-bit value - * or when chproj is called to change the project for the first time. - * As a side effect the superblock version will also get rev'd - * to contain the NLINK bit. - */ -void -xfs_bump_ino_vers2( - xfs_trans_t *tp, - xfs_inode_t *ip) -{ - xfs_mount_t *mp; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_d.di_version == 1); - - ip->i_d.di_version = 2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - mp = tp->t_mountp; - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - xfs_sb_version_addnlink(&mp->m_sb); - spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, XFS_SB_VERSIONNUM); - } else { - spin_unlock(&mp->m_sb_lock); - } - } - /* Caller must log the inode */ -} - -/* * Increment the link count on an inode & log the change. */ int @@ -1120,22 +1052,10 @@ xfs_bumplink( { xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - ASSERT(ip->i_d.di_nlink > 0); + ASSERT(ip->i_d.di_version > 1); + ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE)); ip->i_d.di_nlink++; inc_nlink(VFS_I(ip)); - if ((ip->i_d.di_version == 1) && - (ip->i_d.di_nlink > XFS_MAXLINK_1)) { - /* - * The inode has increased its number of links beyond - * what can fit in an old format inode. It now needs - * to be converted to a version 2 inode with a 32 bit - * link count. If this is the first inode in the file - * system to do this, then we need to bump the superblock - * version number as well. - */ - xfs_bump_ino_vers2(tp, ip); - } - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return 0; } @@ -1170,10 +1090,7 @@ xfs_create( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = xfs_get_projid(dp); - else - prid = XFS_PROJID_DEFAULT; + prid = xfs_get_initial_prid(dp); /* * Make sure that we have allocated dquot(s) on disk. @@ -1338,6 +1255,114 @@ xfs_create( } int +xfs_create_tmpfile( + struct xfs_inode *dp, + struct dentry *dentry, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + uint resblks; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + resblks = XFS_IALLOC_SPACE_RES(mp); + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE); + + tres = &M_RES(mp)->tr_create_tmpfile; + error = xfs_trans_reserve(tp, tres, resblks, 0); + if (error == ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, tres, 0, 0); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; + } + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + ip->i_d.di_nlink--; + error = xfs_iunlink(tp, ip); + if (error) + goto out_trans_abort; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to + * release the inode. This prevents recursive transactions + * and deadlocks from xfs_inactive. + */ + if (ip) + IRELE(ip); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + return error; +} + +int xfs_link( xfs_inode_t *tdp, xfs_inode_t *sip, @@ -1402,6 +1427,12 @@ xfs_link( xfs_bmap_init(&free_list, &first_block); + if (sip->i_d.di_nlink == 0) { + error = xfs_iunlink_remove(tp, sip); + if (error) + goto abort_return; + } + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, &first_block, &free_list, resblks); if (error) @@ -1592,16 +1623,6 @@ xfs_release( int truncated; /* - * If we are using filestreams, and we have an unlinked - * file that we are processing the last close on, then nothing - * will be able to reopen and write to this file. Purge this - * inode from the filestreams cache so that it doesn't delay - * teardown of the inode. - */ - if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip)) - xfs_filestream_deassociate(ip); - - /* * If we previously truncated this file and removed old data * in the process, we want to initiate "early" writeout on * the last close. This is an attempt to combat the notorious @@ -1663,6 +1684,150 @@ xfs_release( } /* + * xfs_inactive_truncate + * + * Called to perform a truncate when an inode becomes unlinked. + */ +STATIC int +xfs_inactive_truncate( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Log the inode size first to prevent stale data exposure in the event + * of a system crash before the truncate completes. See the related + * comment in xfs_setattr_size() for details. + */ + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + if (error) + goto error_trans_cancel; + + ASSERT(ip->i_d.di_nextents == 0); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error_unlock; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + +error_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_inactive_ifree() + * + * Perform the inode free when an inode is unlinked. + */ +STATIC int +xfs_inactive_ifree( + struct xfs_inode *ip) +{ + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + /* + * The ifree transaction might need to allocate blocks for record + * insertion to the finobt. We don't want to fail here at ENOSPC, so + * allow ifree to dip into the reserved block pool if necessary. + * + * Freeing large sets of inodes generally means freeing inode chunks, + * directory and file data blocks, so this should be relatively safe. + * Only under severe circumstances should it be possible to free enough + * inodes to exhaust the reserve block pool via finobt expansion while + * at the same time not creating free space in the filesystem. + * + * Send a warning if the reservation does happen to fail, as the inode + * now remains allocated and sits on the unlinked list until the fs is + * repaired. + */ + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0); + if (error) { + if (error == ENOSPC) { + xfs_warn_ratelimited(mp, + "Failed to remove inode(s) from unlinked list. " + "Please free space, unmount and run xfs_repair."); + } else { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &first_block); + error = xfs_ifree(tp, ip, &free_list); + if (error) { + /* + * If we fail to free the inode, shut down. The cancel + * might do that, we need to make sure. Otherwise the + * inode might be lost for a long time or forever. + */ + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_notice(mp, "%s: xfs_ifree returned error %d", + __func__, error); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + /* + * Credit the quota account(s). The inode is gone. + */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); + + /* + * Just ignore errors at this point. There is nothing we can + * do except to try to keep going. Make sure it's not a silent + * error. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", + __func__, error); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + xfs_notice(mp, "%s: xfs_trans_commit returned error %d", + __func__, error); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* * xfs_inactive * * This is called when the vnode reference count for the vnode @@ -1670,16 +1835,11 @@ xfs_release( * now be truncated. Also, we clear all of the read-ahead state * kept for the inode here since the file is now closed. */ -int +void xfs_inactive( xfs_inode_t *ip) { - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int committed; - struct xfs_trans *tp; struct xfs_mount *mp; - struct xfs_trans_res *resp; int error; int truncate = 0; @@ -1687,19 +1847,17 @@ xfs_inactive( * If the inode is already free, then there can be nothing * to clean up here. */ - if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) { + if (ip->i_d.di_mode == 0) { ASSERT(ip->i_df.if_real_bytes == 0); ASSERT(ip->i_df.if_broot_bytes == 0); - return VN_INACTIVE_CACHE; + return; } mp = ip->i_mount; - error = 0; - /* If this is a read-only mount, don't do this (would generate I/O) */ if (mp->m_flags & XFS_MOUNT_RDONLY) - goto out; + return; if (ip->i_d.di_nlink != 0) { /* @@ -1707,12 +1865,10 @@ xfs_inactive( * cache. Post-eof blocks must be freed, lest we end up with * broken free space accounting. */ - if (xfs_can_free_eofblocks(ip, true)) { - error = xfs_free_eofblocks(mp, ip, false); - if (error) - return VN_INACTIVE_CACHE; - } - goto out; + if (xfs_can_free_eofblocks(ip, true)) + xfs_free_eofblocks(mp, ip, false); + + return; } if (S_ISREG(ip->i_d.di_mode) && @@ -1722,36 +1878,14 @@ xfs_inactive( error = xfs_qm_dqattach(ip, 0); if (error) - return VN_INACTIVE_CACHE; - - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - resp = (truncate || S_ISLNK(ip->i_d.di_mode)) ? - &M_RES(mp)->tr_itruncate : &M_RES(mp)->tr_ifree; - - error = xfs_trans_reserve(tp, resp, 0, 0); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - return VN_INACTIVE_CACHE; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - if (S_ISLNK(ip->i_d.di_mode)) { - error = xfs_inactive_symlink(ip, &tp); - if (error) - goto out_cancel; - } else if (truncate) { - ip->i_d.di_size = 0; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return; - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); - if (error) - goto out_cancel; - - ASSERT(ip->i_d.di_nextents == 0); - } + if (S_ISLNK(ip->i_d.di_mode)) + error = xfs_inactive_symlink(ip); + else if (truncate) + error = xfs_inactive_truncate(ip); + if (error) + return; /* * If there are attributes associated with the file then blow them away @@ -1762,25 +1896,9 @@ xfs_inactive( if (ip->i_d.di_anextents > 0) { ASSERT(ip->i_d.di_forkoff != 0); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) - goto out_unlock; - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - error = xfs_attr_inactive(ip); if (error) - goto out; - - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - goto out; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); + return; } if (ip->i_afp) @@ -1791,52 +1909,14 @@ xfs_inactive( /* * Free the inode. */ - xfs_bmap_init(&free_list, &first_block); - error = xfs_ifree(tp, ip, &free_list); - if (error) { - /* - * If we fail to free the inode, shut down. The cancel - * might do that, we need to make sure. Otherwise the - * inode might be lost for a long time or forever. - */ - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_notice(mp, "%s: xfs_ifree returned error %d", - __func__, error); - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } else { - /* - * Credit the quota account(s). The inode is gone. - */ - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); - - /* - * Just ignore errors at this point. There is nothing we can - * do except to try to keep going. Make sure it's not a silent - * error. - */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) - xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", - __func__, error); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) - xfs_notice(mp, "%s: xfs_trans_commit returned error %d", - __func__, error); - } + error = xfs_inactive_ifree(ip); + if (error) + return; /* * Release the dquots held by inode, if any. */ xfs_qm_dqdetach(ip); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); -out: - return VN_INACTIVE_CACHE; -out_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - goto out_unlock; } /* @@ -2107,8 +2187,8 @@ xfs_ifree_cluster( { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; + int inodes_per_cluster; int nbufs; - int ninodes; int i, j; xfs_daddr_t blkno; xfs_buf_t *bp; @@ -2118,18 +2198,11 @@ xfs_ifree_cluster( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { - blks_per_cluster = 1; - ninodes = mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp); - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / - mp->m_sb.sb_blocksize; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; - } + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = mp->m_ialloc_blks / blks_per_cluster; - for (j = 0; j < nbufs; j++, inum += ninodes) { + for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -2191,7 +2264,7 @@ xfs_ifree_cluster( * transaction stale above, which means there is no point in * even trying to lock them. */ - for (i = 0; i < ninodes; i++) { + for (i = 0; i < inodes_per_cluster; i++) { retry: rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, @@ -2370,6 +2443,33 @@ xfs_iunpin_wait( __xfs_iunpin_wait(ip); } +/* + * Removing an inode from the namespace involves removing the directory entry + * and dropping the link count on the inode. Removing the directory entry can + * result in locking an AGF (directory blocks were freed) and removing a link + * count can result in placing the inode on an unlinked list which results in + * locking an AGI. + * + * The big problem here is that we have an ordering constraint on AGF and AGI + * locking - inode allocation locks the AGI, then can allocate a new extent for + * new inodes, locking the AGF after the AGI. Similarly, freeing the inode + * removes the inode from the unlinked list, requiring that we lock the AGI + * first, and then freeing the inode can result in an inode chunk being freed + * and hence freeing disk space requiring that we lock an AGF. + * + * Hence the ordering that is imposed by other parts of the code is AGI before + * AGF. This means we cannot remove the directory entry before we drop the inode + * reference count and put it on the unlinked list as this results in a lock + * order of AGF then AGI, and this can deadlock against inode allocation and + * freeing. Therefore we must drop the link counts before we remove the + * directory entry. + * + * This is still safe from a transactional point of view - it is not until we + * get to xfs_bmap_finish() that we have the possibility of multiple + * transactions in this operation. Hence as long as we remove the directory + * entry and drop the link count in the first transaction of the remove + * operation, there are no transactional constraints on the ordering here. + */ int xfs_remove( xfs_inode_t *dp, @@ -2439,6 +2539,7 @@ xfs_remove( /* * If we're removing a directory perform some additional validation. */ + cancel_flags |= XFS_TRANS_ABORT; if (is_dir) { ASSERT(ip->i_d.di_nlink >= 2); if (ip->i_d.di_nlink != 2) { @@ -2449,31 +2550,16 @@ xfs_remove( error = XFS_ERROR(ENOTEMPTY); goto out_trans_cancel; } - } - - xfs_bmap_init(&free_list, &first_block); - error = xfs_dir_removename(tp, dp, name, ip->i_ino, - &first_block, &free_list, resblks); - if (error) { - ASSERT(error != ENOENT); - goto out_bmap_cancel; - } - xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - if (is_dir) { - /* - * Drop the link from ip's "..". - */ + /* Drop the link from ip's "..". */ error = xfs_droplink(tp, dp); if (error) - goto out_bmap_cancel; + goto out_trans_cancel; - /* - * Drop the "." link from ip to self. - */ + /* Drop the "." link from ip to self. */ error = xfs_droplink(tp, ip); if (error) - goto out_bmap_cancel; + goto out_trans_cancel; } else { /* * When removing a non-directory we need to log the parent @@ -2482,20 +2568,24 @@ xfs_remove( */ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - /* - * Drop the link from dp to ip. - */ + /* Drop the link from dp to ip. */ error = xfs_droplink(tp, ip); if (error) - goto out_bmap_cancel; + goto out_trans_cancel; - /* - * Determine if this is the last link while - * we are in the transaction. - */ + /* Determine if this is the last link while the inode is locked */ link_zero = (ip->i_d.di_nlink == 0); + xfs_bmap_init(&free_list, &first_block); + error = xfs_dir_removename(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) { + ASSERT(error != ENOENT); + goto out_bmap_cancel; + } + /* * If this is a synchronous mount, make sure that the * remove transaction goes to disk before returning to @@ -2512,20 +2602,13 @@ xfs_remove( if (error) goto std_return; - /* - * If we are using filestreams, kill the stream association. - * If the file is still open it may get a new one but that - * will get killed on last close in xfs_close() so we don't - * have to worry about that. - */ - if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) + if (is_dir && xfs_inode_is_filestream(ip)) xfs_filestream_deassociate(ip); return 0; out_bmap_cancel: xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); std_return: @@ -2856,13 +2939,13 @@ xfs_iflush_cluster( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; + inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); if (!ilist) goto out_put; - mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ @@ -3107,6 +3190,7 @@ xfs_iflush_int( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip != NULL && iip->ili_fields != 0); + ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -3167,7 +3251,7 @@ xfs_iflush_int( } /* - * Inode item log recovery for v1/v2 inodes are dependent on the + * Inode item log recovery for v2 inodes are dependent on the * di_flushiter count for correct sequencing. We bump the flush * iteration count so we can detect flushes which postdate a log record * during recovery. This is redundant as we now log every change and @@ -3190,40 +3274,9 @@ xfs_iflush_int( if (ip->i_d.di_flushiter == DI_MAX_FLUSH) ip->i_d.di_flushiter = 0; - /* - * If this is really an old format inode and the superblock version - * has not been updated to support only new format inodes, then - * convert back to the old inode format. If the superblock version - * has been updated, then make the conversion permanent. - */ - ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); - if (ip->i_d.di_version == 1) { - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. - */ - ip->i_d.di_version = 2; - dip->di_version = 2; - ip->i_d.di_onlink = 0; - dip->di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - memset(&(dip->di_pad[0]), 0, - sizeof(dip->di_pad)); - ASSERT(xfs_get_projid(ip) == 0); - } - } - - xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); if (XFS_IFORK_Q(ip)) - xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); xfs_inobp_check(mp, bp); /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 4a91358c147..f72bffa6726 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -20,11 +20,11 @@ #include "xfs_inode_buf.h" #include "xfs_inode_fork.h" +#include "xfs_dinode.h" /* * Kernel only inode definitions */ - struct xfs_dinode; struct xfs_inode; struct xfs_buf; @@ -50,6 +50,9 @@ typedef struct xfs_inode { xfs_ifork_t *i_afp; /* attribute fork pointer */ xfs_ifork_t i_df; /* data fork */ + /* operations vectors */ + const struct xfs_dir_ops *d_ops; /* directory ops vector */ + /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ @@ -190,6 +193,15 @@ xfs_set_projid(struct xfs_inode *ip, ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); } +static inline prid_t +xfs_get_initial_prid(struct xfs_inode *dp) +{ + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + return xfs_get_projid(dp); + + return XFS_PROJID_DEFAULT; +} + /* * In-core inode flags. */ @@ -197,7 +209,6 @@ xfs_set_projid(struct xfs_inode *ip, #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ #define XFS_INEW (1 << 3) /* inode has just been allocated */ -#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */ #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ @@ -213,8 +224,7 @@ xfs_set_projid(struct xfs_inode *ip, */ #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ - XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \ - XFS_IFILESTREAM); + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) /* * Synchronize processes attempting to flush the in-core inode back to disk. @@ -316,11 +326,13 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) int xfs_release(struct xfs_inode *ip); -int xfs_inactive(struct xfs_inode *ip); +void xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); +int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, + umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, @@ -335,8 +347,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); -uint xfs_ilock_map_shared(xfs_inode_t *); -void xfs_iunlock_map_shared(xfs_inode_t *, uint); +uint xfs_ilock_data_map_shared(struct xfs_inode *); +uint xfs_ilock_attr_map_shared(struct xfs_inode *); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, xfs_inode_t **); @@ -365,7 +377,6 @@ int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, struct xfs_inode **, int *); int xfs_droplink(struct xfs_trans *, struct xfs_inode *); int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); -void xfs_bump_ino_vers2(struct xfs_trans *, struct xfs_inode *); /* from xfs_file.c */ int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c index 63382d37f56..cb35ae41d4a 100644 --- a/fs/xfs/xfs_inode_buf.c +++ b/fs/xfs/xfs_inode_buf.c @@ -17,20 +17,20 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_error.h" #include "xfs_cksum.h" #include "xfs_icache.h" +#include "xfs_trans.h" #include "xfs_ialloc.h" +#include "xfs_dinode.h" /* * Check that none of the inode's in the buffer have a next @@ -102,8 +102,7 @@ xfs_inode_buf_verify( } xfs_buf_ioerror(bp, EFSCORRUPTED); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, - mp, dip); + xfs_verifier_error(bp); #ifdef DEBUG xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", @@ -306,7 +305,7 @@ xfs_dinode_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, - offsetof(struct xfs_dinode, di_crc))) + XFS_DINODE_CRC_OFF)) return false; if (be64_to_cpu(dip->di_ino) != ip->i_ino) return false; @@ -327,7 +326,7 @@ xfs_dinode_calc_crc( ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, - offsetof(struct xfs_dinode, di_crc)); + XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); } @@ -438,17 +437,16 @@ xfs_iread( } /* - * The inode format changed when we moved the link count and - * made it 32 bits long. If this is an old format inode, - * convert it in memory to look like a new one. If it gets - * flushed to disk we will convert back before flushing or - * logging it. We zero out the new projid field and the old link - * count field. We'll handle clearing the pad field (the remains - * of the old uuid field) when we actually convert the inode to - * the new format. We don't change the version number so that we - * can distinguish this from a real new format inode. + * Automatically convert version 1 inode formats in memory to version 2 + * inode format. If the inode is modified, it will get logged and + * rewritten as a version 2 inode. We can do this because we set the + * superblock feature bit for v2 inodes unconditionally during mount + * and it means the reast of the code can assume the inode version is 2 + * or higher. */ if (ip->i_d.di_version == 1) { + ip->i_d.di_version = 2; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); ip->i_d.di_nlink = ip->i_d.di_onlink; ip->i_d.di_onlink = 0; xfs_set_projid(ip, 0); diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h index abba0ae8cf2..9308c47f2a5 100644 --- a/fs/xfs/xfs_inode_buf.h +++ b/fs/xfs/xfs_inode_buf.h @@ -47,7 +47,4 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #define xfs_inobp_check(mp, bp) #endif /* DEBUG */ -extern const struct xfs_buf_ops xfs_inode_buf_ops; -extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; - #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c index 02f1083955b..b031e8d0d92 100644 --- a/fs/xfs/xfs_inode_fork.c +++ b/fs/xfs/xfs_inode_fork.c @@ -20,31 +20,21 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" -#include "xfs_log.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_buf_item.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" -#include "xfs_quota.h" -#include "xfs_filestream.h" -#include "xfs_cksum.h" #include "xfs_trace.h" -#include "xfs_icache.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" kmem_zone_t *xfs_ifork_zone; @@ -441,6 +431,8 @@ xfs_iread_extents( xfs_ifork_t *ifp; xfs_extnum_t nextents; + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, ip->i_mount); @@ -731,15 +723,16 @@ xfs_idestroy_fork( } /* - * xfs_iextents_copy() + * Convert in-core extents to on-disk form * - * This is called to copy the REAL extents (as opposed to the delayed - * allocation extents) from the inode into the given buffer. It - * returns the number of bytes copied into the buffer. + * For either the data or attr fork in extent format, we need to endian convert + * the in-core extent as we place them into the on-disk inode. * - * If there are no delayed allocation extents, then we can just - * memcpy() the extents into the buffer. Otherwise, we need to - * examine each extent in turn and skip those which are delayed. + * In the case of the data fork, the in-core and on-disk fork sizes can be + * different due to delayed allocation extents. We only copy on-disk extents + * here, so callers must always use the physical fork size to determine the + * size of the buffer passed to this routine. We will return the size actually + * used. */ int xfs_iextents_copy( @@ -805,8 +798,7 @@ xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, xfs_inode_log_item_t *iip, - int whichfork, - xfs_buf_t *bp) + int whichfork) { char *cp; xfs_ifork_t *ifp; @@ -1031,15 +1023,14 @@ xfs_iext_add( * the next index needed in the indirection array. */ else { - int count = ext_diff; + uint count = ext_diff; while (count) { erp = xfs_iext_irec_new(ifp, erp_idx); - erp->er_extcount = count; - count -= MIN(count, (int)XFS_LINEAR_EXTS); - if (count) { + erp->er_extcount = min(count, XFS_LINEAR_EXTS); + count -= erp->er_extcount; + if (count) erp_idx++; - } } } } @@ -1359,7 +1350,7 @@ xfs_iext_remove_indirect( void xfs_iext_realloc_direct( xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new size of extents */ + int new_size) /* new size of extents after adding */ { int rnew_size; /* real new size of extents */ @@ -1397,13 +1388,8 @@ xfs_iext_realloc_direct( rnew_size - ifp->if_real_bytes); } } - /* - * Switch from the inline extent buffer to a direct - * extent list. Be sure to include the inline extent - * bytes in new_size. - */ + /* Switch from the inline extent buffer to a direct extent list */ else { - new_size += ifp->if_bytes; if (!is_power_of_2(new_size)) { rnew_size = roundup_pow_of_two(new_size); } diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h index 28661a0d905..7d3b1ed6dcb 100644 --- a/fs/xfs/xfs_inode_fork.h +++ b/fs/xfs/xfs_inode_fork.h @@ -19,6 +19,7 @@ #define __XFS_INODE_FORK_H__ struct xfs_inode_log_item; +struct xfs_dinode; /* * The following xfs_ext_irec_t struct introduces a second (top) level @@ -126,8 +127,7 @@ typedef struct xfs_ifork { int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, - struct xfs_inode_log_item *, int, - struct xfs_buf *); + struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_inode *, int); void xfs_idata_realloc(struct xfs_inode *, int, int); void xfs_iroot_realloc(struct xfs_inode *, int, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 37808110984..a640137b357 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -17,19 +17,20 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_trans_priv.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_trans_priv.h" +#include "xfs_dinode.h" +#include "xfs_log.h" kmem_zone_t *xfs_ili_zone; /* inode log item zone */ @@ -39,27 +40,14 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_inode_log_item, ili_item); } - -/* - * This returns the number of iovecs needed to log the given inode item. - * - * We need one iovec for the inode log format structure, one for the - * inode core, and possibly one for the inode data/extents/b-tree root - * and one for the inode attribute data/extents/b-tree root. - */ STATIC void -xfs_inode_item_size( - struct xfs_log_item *lip, +xfs_inode_item_data_fork_size( + struct xfs_inode_log_item *iip, int *nvecs, int *nbytes) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - *nvecs += 2; - *nbytes += sizeof(struct xfs_inode_log_format) + - xfs_icdinode_size(ip->i_d.di_version); - switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_DEXT) && @@ -70,7 +58,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { @@ -78,7 +65,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { @@ -90,19 +76,20 @@ xfs_inode_item_size( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_UUID: break; - default: ASSERT(0); break; } +} - if (!XFS_IFORK_Q(ip)) - return; - +STATIC void +xfs_inode_item_attr_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode *ip = iip->ili_inode; - /* - * Log any necessary attribute data. - */ switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && @@ -113,7 +100,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_ABROOT) && ip->i_afp->if_broot_bytes > 0) { @@ -121,7 +107,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { @@ -129,7 +114,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - default: ASSERT(0); break; @@ -137,98 +121,39 @@ xfs_inode_item_size( } /* - * xfs_inode_item_format_extents - convert in-core extents to on-disk form - * - * For either the data or attr fork in extent format, we need to endian convert - * the in-core extent as we place them into the on-disk inode. In this case, we - * need to do this conversion before we write the extents into the log. Because - * we don't have the disk inode to write into here, we allocate a buffer and - * format the extents into it via xfs_iextents_copy(). We free the buffer in - * the unlock routine after the copy for the log has been made. + * This returns the number of iovecs needed to log the given inode item. * - * In the case of the data fork, the in-core and on-disk fork sizes can be - * different due to delayed allocation extents. We only log on-disk extents - * here, so always use the physical fork size to determine the size of the - * buffer we need to allocate. + * We need one iovec for the inode log format structure, one for the + * inode core, and possibly one for the inode data/extents/b-tree root + * and one for the inode attribute data/extents/b-tree root. */ STATIC void -xfs_inode_item_format_extents( - struct xfs_inode *ip, - struct xfs_log_iovec *vecp, - int whichfork, - int type) +xfs_inode_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - xfs_bmbt_rec_t *ext_buffer; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; - ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); - if (whichfork == XFS_DATA_FORK) - ip->i_itemp->ili_extents_buf = ext_buffer; - else - ip->i_itemp->ili_aextents_buf = ext_buffer; + *nvecs += 2; + *nbytes += sizeof(struct xfs_inode_log_format) + + xfs_icdinode_size(ip->i_d.di_version); - vecp->i_addr = ext_buffer; - vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); - vecp->i_type = type; + xfs_inode_item_data_fork_size(iip, nvecs, nbytes); + if (XFS_IFORK_Q(ip)) + xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); } -/* - * This is called to fill in the vector of log iovecs for the - * given inode log item. It fills the first item with an inode - * log format structure, the second with the on-disk inode structure, - * and a possible third and/or fourth with the inode data/extents/b-tree - * root and inode attributes data/extents/b-tree root. - */ STATIC void -xfs_inode_item_format( - struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) +xfs_inode_item_format_data_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - uint nvecs; size_t data_bytes; - xfs_mount_t *mp; - - vecp->i_addr = &iip->ili_format; - vecp->i_len = sizeof(xfs_inode_log_format_t); - vecp->i_type = XLOG_REG_TYPE_IFORMAT; - vecp++; - nvecs = 1; - - vecp->i_addr = &ip->i_d; - vecp->i_len = xfs_icdinode_size(ip->i_d.di_version); - vecp->i_type = XLOG_REG_TYPE_ICORE; - vecp++; - nvecs++; - - /* - * If this is really an old format inode, then we need to - * log it as such. This means that we have to copy the link - * count from the new field to the old. We don't have to worry - * about the new fields, because nothing trusts them as long as - * the old inode version number is there. If the superblock already - * has a new version number, then we don't bother converting back. - */ - mp = ip->i_mount; - ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); - if (ip->i_d.di_version == 1) { - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - ip->i_d.di_onlink = ip->i_d.di_nlink; - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. - */ - ip->i_d.di_version = 2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - } - } switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: @@ -239,36 +164,23 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_DEXT) && ip->i_d.di_nextents > 0 && ip->i_df.if_bytes > 0) { + struct xfs_bmbt_rec *p; + ASSERT(ip->i_df.if_u1.if_extents != NULL); ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); - ASSERT(iip->ili_extents_buf == NULL); - -#ifdef XFS_NATIVE_HOST - if (ip->i_d.di_nextents == ip->i_df.if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)) { - /* - * There are no delayed allocation - * extents, so just point to the - * real extents array. - */ - vecp->i_addr = ip->i_df.if_u1.if_extents; - vecp->i_len = ip->i_df.if_bytes; - vecp->i_type = XLOG_REG_TYPE_IEXT; - } else -#endif - { - xfs_inode_item_format_extents(ip, vecp, - XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); - } - ASSERT(vecp->i_len <= ip->i_df.if_bytes); - iip->ili_format.ilf_dsize = vecp->i_len; - vecp++; - nvecs++; + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ASSERT(data_bytes <= ip->i_df.if_bytes); + + ilf->ilf_dsize = data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DEXT; } break; - case XFS_DINODE_FMT_BTREE: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | @@ -277,80 +189,70 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { ASSERT(ip->i_df.if_broot != NULL); - vecp->i_addr = ip->i_df.if_broot; - vecp->i_len = ip->i_df.if_broot_bytes; - vecp->i_type = XLOG_REG_TYPE_IBROOT; - vecp++; - nvecs++; - iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT, + ip->i_df.if_broot, + ip->i_df.if_broot_bytes); + ilf->ilf_dsize = ip->i_df.if_broot_bytes; + ilf->ilf_size++; } else { ASSERT(!(iip->ili_fields & XFS_ILOG_DBROOT)); iip->ili_fields &= ~XFS_ILOG_DBROOT; } break; - case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV | XFS_ILOG_UUID); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - ASSERT(ip->i_df.if_u1.if_data != NULL); - ASSERT(ip->i_d.di_size > 0); - - vecp->i_addr = ip->i_df.if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_df.if_bytes, 4); - ASSERT((ip->i_df.if_real_bytes == 0) || - (ip->i_df.if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - vecp->i_type = XLOG_REG_TYPE_ILOCAL; - vecp++; - nvecs++; - iip->ili_format.ilf_dsize = (unsigned)data_bytes; + ASSERT(ip->i_df.if_real_bytes == 0 || + ip->i_df.if_real_bytes == data_bytes); + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, + ip->i_df.if_u1.if_data, data_bytes); + ilf->ilf_dsize = (unsigned)data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DDATA; } break; - case XFS_DINODE_FMT_DEV: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT | XFS_ILOG_UUID); - if (iip->ili_fields & XFS_ILOG_DEV) { - iip->ili_format.ilf_u.ilfu_rdev = - ip->i_df.if_u2.if_rdev; - } + if (iip->ili_fields & XFS_ILOG_DEV) + ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; break; - case XFS_DINODE_FMT_UUID: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT | XFS_ILOG_DEV); - if (iip->ili_fields & XFS_ILOG_UUID) { - iip->ili_format.ilf_u.ilfu_uuid = - ip->i_df.if_u2.if_uuid; - } + if (iip->ili_fields & XFS_ILOG_UUID) + ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; break; - default: ASSERT(0); break; } +} - /* - * If there are no attributes associated with the file, then we're done. - */ - if (!XFS_IFORK_Q(ip)) { - iip->ili_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); - goto out; - } +STATIC void +xfs_inode_item_format_attr_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_inode *ip = iip->ili_inode; + size_t data_bytes; switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: @@ -360,30 +262,22 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_AEXT) && ip->i_d.di_anextents > 0 && ip->i_afp->if_bytes > 0) { + struct xfs_bmbt_rec *p; + ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); -#ifdef XFS_NATIVE_HOST - /* - * There are not delayed allocation extents - * for attributes, so just point at the array. - */ - vecp->i_addr = ip->i_afp->if_u1.if_extents; - vecp->i_len = ip->i_afp->if_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; -#else - ASSERT(iip->ili_aextents_buf == NULL); - xfs_inode_item_format_extents(ip, vecp, - XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); -#endif - iip->ili_format.ilf_asize = vecp->i_len; - vecp++; - nvecs++; + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ilf->ilf_asize = data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_AEXT; } break; - case XFS_DINODE_FMT_BTREE: iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); @@ -392,61 +286,89 @@ xfs_inode_item_format( ip->i_afp->if_broot_bytes > 0) { ASSERT(ip->i_afp->if_broot != NULL); - vecp->i_addr = ip->i_afp->if_broot; - vecp->i_len = ip->i_afp->if_broot_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; - vecp++; - nvecs++; - iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT, + ip->i_afp->if_broot, + ip->i_afp->if_broot_bytes); + ilf->ilf_asize = ip->i_afp->if_broot_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ABROOT; } break; - case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { - ASSERT(ip->i_afp->if_u1.if_data != NULL); - - vecp->i_addr = ip->i_afp->if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_afp->if_bytes, 4); - ASSERT((ip->i_afp->if_real_bytes == 0) || - (ip->i_afp->if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; - vecp++; - nvecs++; - iip->ili_format.ilf_asize = (unsigned)data_bytes; + ASSERT(ip->i_afp->if_real_bytes == 0 || + ip->i_afp->if_real_bytes == data_bytes); + ASSERT(ip->i_afp->if_u1.if_data != NULL); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, + ip->i_afp->if_u1.if_data, + data_bytes); + ilf->ilf_asize = (unsigned)data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; } break; - default: ASSERT(0); break; } - -out: - /* - * Now update the log format that goes out to disk from the in-core - * values. We always write the inode core to make the arithmetic - * games in recovery easier, which isn't a big deal as just about any - * transaction would dirty it anyway. - */ - iip->ili_format.ilf_fields = XFS_ILOG_CORE | - (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); - iip->ili_format.ilf_size = nvecs; } +/* + * This is called to fill in the vector of log iovecs for the given inode + * log item. It fills the first item with an inode log format structure, + * the second with the on-disk inode structure, and a possible third and/or + * fourth with the inode data/extents/b-tree root and inode attributes + * data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_inode_log_format *ilf; + struct xfs_log_iovec *vecp = NULL; + + ASSERT(ip->i_d.di_version > 1); + + ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); + ilf->ilf_type = XFS_LI_INODE; + ilf->ilf_ino = ip->i_ino; + ilf->ilf_blkno = ip->i_imap.im_blkno; + ilf->ilf_len = ip->i_imap.im_len; + ilf->ilf_boffset = ip->i_imap.im_boffset; + ilf->ilf_fields = XFS_ILOG_CORE; + ilf->ilf_size = 2; /* format + core */ + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, + &ip->i_d, + xfs_icdinode_size(ip->i_d.di_version)); + + xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); + if (XFS_IFORK_Q(ip)) { + xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); + } else { + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + } + + /* update the format with the exact fields we actually logged */ + ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); +} /* * This is called to pin the inode associated with the inode log @@ -563,27 +485,6 @@ xfs_inode_item_unlock( ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - /* - * If the inode needed a separate buffer with which to log - * its extents, then free it now. - */ - if (iip->ili_extents_buf != NULL) { - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_nextents > 0); - ASSERT(iip->ili_fields & XFS_ILOG_DEXT); - ASSERT(ip->i_df.if_bytes > 0); - kmem_free(iip->ili_extents_buf); - iip->ili_extents_buf = NULL; - } - if (iip->ili_aextents_buf != NULL) { - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_anextents > 0); - ASSERT(iip->ili_fields & XFS_ILOG_AEXT); - ASSERT(ip->i_afp->if_bytes > 0); - kmem_free(iip->ili_aextents_buf); - iip->ili_aextents_buf = NULL; - } - lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; if (lock_flags) @@ -670,11 +571,6 @@ xfs_inode_item_init( iip->ili_inode = ip; xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, &xfs_inode_item_ops); - iip->ili_format.ilf_type = XFS_LI_INODE; - iip->ili_format.ilf_ino = ip->i_ino; - iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; - iip->ili_format.ilf_len = ip->i_imap.im_len; - iip->ili_format.ilf_boffset = ip->i_imap.im_boffset; } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index dce4d656768..488d81254e2 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -34,11 +34,6 @@ typedef struct xfs_inode_log_item { unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - struct xfs_bmbt_rec *ili_extents_buf; /* array of logged - data exts */ - struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged - attr exts */ - xfs_inode_log_format_t ili_format; /* logged structure */ } xfs_inode_log_item_t; static inline int xfs_inode_clean(xfs_inode_t *ip) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 668e8f4ccf5..8bc1bbce745 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -17,32 +17,31 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ioctl.h" +#include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_itable.h" #include "xfs_error.h" #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" -#include "xfs_buf_item.h" #include "xfs_fsops.h" #include "xfs_discard.h" #include "xfs_quota.h" -#include "xfs_inode_item.h" #include "xfs_export.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_symlink.h" +#include "xfs_dinode.h" +#include "xfs_trans.h" #include <linux/capability.h> #include <linux/dcache.h> @@ -113,15 +112,11 @@ xfs_find_handle( memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); hsize = sizeof(xfs_fsid_t); } else { - int lock_mode; - - lock_mode = xfs_ilock_map_shared(ip); handle.ha_fid.fid_len = sizeof(xfs_fid_t) - sizeof(handle.ha_fid.fid_len); handle.ha_fid.fid_pad = 0; handle.ha_fid.fid_gen = ip->i_d.di_gen; handle.ha_fid.fid_ino = ip->i_ino; - xfs_iunlock_map_shared(ip, lock_mode); hsize = XFS_HSIZE(handle); } @@ -276,32 +271,6 @@ xfs_open_by_handle( return error; } -/* - * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's - * unused first argument. - */ -STATIC int -do_readlink( - char __user *buffer, - int buflen, - const char *link) -{ - int len; - - len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; - - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; - out: - return len; -} - - int xfs_readlink_by_handle( struct file *parfilp, @@ -339,7 +308,7 @@ xfs_readlink_by_handle( error = -xfs_readlink(XFS_I(dentry->d_inode), link); if (error) goto out_kfree; - error = do_readlink(hreq->ohandle, olen, link); + error = readlink_copy(hreq->ohandle, olen, link); if (error) goto out_kfree; @@ -443,7 +412,8 @@ xfs_attrlist_by_handle( return -XFS_ERROR(EPERM); if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* @@ -573,10 +543,11 @@ xfs_attrmulti_by_handle( ops = memdup_user(am_hreq.ops, size); if (IS_ERR(ops)) { - error = PTR_ERR(ops); + error = -PTR_ERR(ops); goto out_dput; } + error = ENOMEM; attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; @@ -586,7 +557,7 @@ xfs_attrmulti_by_handle( ops[i].am_error = strncpy_from_user((char *)attr_name, ops[i].am_attrname, MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; + error = ERANGE; if (ops[i].am_error < 0) break; @@ -641,7 +612,11 @@ xfs_ioc_space( unsigned int cmd, xfs_flock64_t *bf) { - int attr_flags = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct iattr iattr; + bool setprealloc = false; + bool clrprealloc = false; int error; /* @@ -661,19 +636,128 @@ xfs_ioc_space( if (!S_ISREG(inode->i_mode)) return -XFS_ERROR(EINVAL); - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= XFS_ATTR_NONBLOCK; + error = mnt_want_write_file(filp); + if (error) + return error; - if (filp->f_flags & O_DSYNC) - attr_flags |= XFS_ATTR_SYNC; + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + switch (bf->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + bf->l_start += filp->f_pos; + break; + case 2: /*SEEK_END*/ + bf->l_start += XFS_ISIZE(ip); + break; + default: + error = XFS_ERROR(EINVAL); + goto out_unlock; + } - if (ioflags & IO_INVIS) - attr_flags |= XFS_ATTR_DMI; + /* + * length of <= 0 for resv/unresv/zero is invalid. length for + * alloc/free is ignored completely and we have no idea what userspace + * might have set it to, so set it to zero to allow range + * checks to pass. + */ + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if (bf->l_len <= 0) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + break; + default: + bf->l_len = 0; + break; + } + + if (bf->l_start < 0 || + bf->l_start > mp->m_super->s_maxbytes || + bf->l_start + bf->l_len < 0 || + bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + error = xfs_zero_file_space(ip, bf->l_start, bf->l_len); + if (!error) + setprealloc = true; + break; + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len, + XFS_BMAPI_PREALLOC); + if (!error) + setprealloc = true; + break; + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + error = xfs_free_file_space(ip, bf->l_start, bf->l_len); + break; + case XFS_IOC_ALLOCSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP: + case XFS_IOC_FREESP64: + if (bf->l_start > XFS_ISIZE(ip)) { + error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), + bf->l_start - XFS_ISIZE(ip), 0); + if (error) + goto out_unlock; + } + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = bf->l_start; + + error = xfs_setattr_size(ip, &iattr); + if (!error) + clrprealloc = true; + break; + default: + ASSERT(0); + error = XFS_ERROR(EINVAL); + } - error = mnt_want_write_file(filp); if (error) - return error; - error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); + goto out_unlock; + + tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_unlock; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + if (!(ioflags & IO_INVIS)) { + ip->i_d.di_mode &= ~S_ISUID; + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } + + if (setprealloc) + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + else if (clrprealloc) + ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + if (filp->f_flags & O_DSYNC) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); mnt_drop_write_file(filp); return -error; } @@ -1132,7 +1216,7 @@ xfs_ioctl_setattr( * cleared upon successful return from chown() */ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !inode_capable(VFS_I(ip), CAP_FSETID)) + !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); /* @@ -1144,15 +1228,8 @@ xfs_ioctl_setattr( olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } + ASSERT(ip->i_d.di_version > 1); xfs_set_projid(ip, fa->fsx_projid); - - /* - * We may have to rev the inode as well as - * the superblock version number since projids didn't - * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. - */ - if (ip->i_d.di_version == 1) - xfs_bump_ino_vers2(tp, ip); } } @@ -1474,7 +1551,7 @@ xfs_file_ioctl( XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - da.d_mem = da.d_miniosz = 1 << target->bt_sshift; + da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index f671f7e472a..944d5baa710 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -22,14 +22,13 @@ #include <asm/uaccess.h> #include "xfs.h" #include "xfs_fs.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_vnode.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_itable.h" #include "xfs_error.h" @@ -357,7 +356,8 @@ xfs_compat_attrlist_by_handle( if (copy_from_user(&al_hreq, arg, sizeof(compat_xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* @@ -424,10 +424,11 @@ xfs_compat_attrmulti_by_handle( ops = memdup_user(compat_ptr(am_hreq.ops), size); if (IS_ERR(ops)) { - error = PTR_ERR(ops); + error = -PTR_ERR(ops); goto out_dput; } + error = ENOMEM; attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; @@ -438,7 +439,7 @@ xfs_compat_attrmulti_by_handle( compat_ptr(ops[i].am_attrname), MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; + error = ERANGE; if (ops[i].am_error < 0) break; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 8d4d49b6fbf..6d3ec2b6ee2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -17,34 +17,28 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" #include "xfs_btree.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" -#include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" +#include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" +#include "xfs_dinode.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -110,7 +104,7 @@ xfs_alert_fsblock_zero( xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, "Access to block zero in inode %llu " "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x\n", + "blkcnt: %llx extent-state: %x", (unsigned long long)ip->i_ino, (unsigned long long)imap->br_startblock, (unsigned long long)imap->br_startoff, @@ -134,7 +128,6 @@ xfs_iomap_write_direct( xfs_fsblock_t firstfsb; xfs_extlen_t extsz, temp; int nimaps; - int bmapi_flag; int quota_flag; int rt; xfs_trans_t *tp; @@ -206,18 +199,15 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip, 0); - bmapi_flag = 0; - if (offset < XFS_ISIZE(ip) || extsz) - bmapi_flag |= XFS_BMAPI_PREALLOC; - /* * From this point onwards we overwrite the imap pointer that the * caller gave to us. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, - &firstfsb, 0, imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_PREALLOC, &firstfsb, 0, + imap, &nimaps, &free_list); if (error) goto out_bmap_cancel; @@ -655,7 +645,6 @@ int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, - size_t count, xfs_bmbt_irec_t *imap) { xfs_mount_t *mp = ip->i_mount; @@ -741,7 +730,7 @@ xfs_iomap_write_allocate( */ nimaps = 1; end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - error = xfs_bmap_last_offset(NULL, ip, &last_block, + error = xfs_bmap_last_offset(ip, &last_block, XFS_DATA_FORK); if (error) goto trans_cancel; @@ -760,8 +749,7 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, - XFS_BMAPI_STACK_SWITCH, + count_fsb, 0, &first_block, 1, imap, &nimaps, &free_list); if (error) diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 80615760959..411fbb8919e 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -21,12 +21,12 @@ struct xfs_inode; struct xfs_bmbt_irec; -extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, +int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, +int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *); -extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, +int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, struct xfs_bmbt_irec *); -extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 2b8952d9199..205613a0606 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -17,32 +17,29 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_acl.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" -#include "xfs_rtalloc.h" +#include "xfs_acl.h" +#include "xfs_quota.h" #include "xfs_error.h" -#include "xfs_itable.h" #include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_inode_item.h" +#include "xfs_trans.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_symlink.h" #include "xfs_da_btree.h" -#include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" +#include "xfs_dinode.h" +#include "xfs_trans_space.h" #include <linux/capability.h> #include <linux/xattr.h> @@ -52,6 +49,18 @@ #include <linux/fiemap.h> #include <linux/slab.h> +/* + * Directories have different lock order w.r.t. mmap_sem compared to regular + * files. This is due to readdir potentially triggering page faults on a user + * buffer inside filldir(), and this happens with the ilock on the directory + * held. For regular files, the lock order is the other way around - the + * mmap_sem is taken during the page fault, and then we lock the ilock to do + * block mapping. Hence we need a different class for the directory ilock so + * that lockdep can tell them apart. + */ +static struct lock_class_key xfs_nondir_ilock_class; +static struct lock_class_key xfs_dir_ilock_class; + static int xfs_initxattrs( struct inode *inode, @@ -63,8 +72,8 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + error = -xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); if (error < 0) break; } @@ -84,8 +93,8 @@ xfs_init_security( struct inode *dir, const struct qstr *qstr) { - return security_inode_init_security(inode, dir, qstr, - &xfs_initxattrs, NULL); + return -security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); } static void @@ -115,19 +124,19 @@ xfs_cleanup_inode( xfs_dentry_to_name(&teardown, dentry, 0); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); - iput(inode); } STATIC int -xfs_vn_mknod( +xfs_generic_create( struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) + dev_t rdev, + bool tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; - struct posix_acl *default_acl = NULL; + struct posix_acl *default_acl, *acl; struct xfs_name name; int error; @@ -143,17 +152,16 @@ xfs_vn_mknod( rdev = 0; } - if (IS_POSIXACL(dir)) { - default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(default_acl)) - return PTR_ERR(default_acl); + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + return error; - if (!default_acl) - mode &= ~current_umask(); + if (!tmpfile) { + xfs_dentry_to_name(&name, dentry, mode); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + } else { + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); } - - xfs_dentry_to_name(&name, dentry, mode); - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); if (unlikely(error)) goto out_free_acl; @@ -163,22 +171,46 @@ xfs_vn_mknod( if (unlikely(error)) goto out_cleanup_inode; +#ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = -xfs_inherit_acl(inode, default_acl); - default_acl = NULL; - if (unlikely(error)) + error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) + goto out_cleanup_inode; + } + if (acl) { + error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) goto out_cleanup_inode; } +#endif + if (tmpfile) + d_tmpfile(dentry, inode); + else + d_instantiate(dentry, inode); - d_instantiate(dentry, inode); + out_free_acl: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); return -error; out_cleanup_inode: - xfs_cleanup_inode(dir, inode, dentry); - out_free_acl: - posix_acl_release(default_acl); - return -error; + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); + goto out_free_acl; +} + +STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return xfs_generic_create(dir, dentry, mode, rdev, false); } STATIC int @@ -341,6 +373,7 @@ xfs_vn_symlink( out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); + iput(inode); out: return -error; } @@ -395,18 +428,6 @@ xfs_vn_follow_link( return NULL; } -STATIC void -xfs_vn_put_link( - struct dentry *dentry, - struct nameidata *nd, - void *p) -{ - char *s = nd_get_link(nd); - - if (!IS_ERR(s)) - kfree(s); -} - STATIC int xfs_vn_getattr( struct vfsmount *mnt, @@ -463,14 +484,12 @@ xfs_vn_getattr( static void xfs_setattr_mode( - struct xfs_trans *tp, struct xfs_inode *ip, struct iattr *iattr) { - struct inode *inode = VFS_I(ip); - umode_t mode = iattr->ia_mode; + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; - ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ip->i_d.di_mode &= S_IFMT; @@ -480,6 +499,32 @@ xfs_setattr_mode( inode->i_mode |= mode & ~S_IFMT; } +static void +xfs_setattr_time( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (iattr->ia_valid & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + } + if (iattr->ia_valid & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + } + if (iattr->ia_valid & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + } +} + int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -622,7 +667,8 @@ xfs_setattr_nonsize( } if (!gid_eq(igid, gid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { - ASSERT(!XFS_IS_PQUOTA_ON(mp)); + ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || + !XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); ASSERT(gdqp); olddquot2 = xfs_qm_vop_chown(tp, ip, @@ -633,30 +679,10 @@ xfs_setattr_nonsize( } } - /* - * Change file access modes. - */ if (mask & ATTR_MODE) - xfs_setattr_mode(tp, ip, iattr); - - /* - * Change file access or modified times. - */ - if (mask & ATTR_ATIME) { - inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - } - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } + xfs_setattr_mode(ip, iattr); + if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -687,7 +713,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { - error = -xfs_acl_chmod(inode); + error = -posix_acl_chmod(inode, inode->i_mode); if (error) return XFS_ERROR(error); } @@ -709,12 +735,10 @@ out_dqrele: int xfs_setattr_size( struct xfs_inode *ip, - struct iattr *iattr, - int flags) + struct iattr *iattr) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); - int mask = iattr->ia_valid; xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; @@ -733,14 +757,10 @@ xfs_setattr_size( if (error) return XFS_ERROR(error); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(S_ISREG(ip->i_d.di_mode)); - ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); - - if (!(flags & XFS_ATTR_NOLOCK)) { - lock_flags |= XFS_IOLOCK_EXCL; - xfs_ilock(ip, lock_flags); - } + ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); oldsize = inode->i_size; newsize = iattr->ia_size; @@ -749,13 +769,12 @@ xfs_setattr_size( * Short circuit the truncate case for zero length files. */ if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { - if (!(mask & (ATTR_CTIME|ATTR_MTIME))) - goto out_unlock; + if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) + return 0; /* * Use the regular setattr path to update the timestamps. */ - xfs_iunlock(ip, lock_flags); iattr->ia_valid &= ~ATTR_SIZE; return xfs_setattr_nonsize(ip, iattr, 0); } @@ -765,7 +784,7 @@ xfs_setattr_size( */ error = xfs_qm_dqattach(ip, 0); if (error) - goto out_unlock; + return error; /* * Now we can make the changes. Before we join the inode to the @@ -783,7 +802,7 @@ xfs_setattr_size( */ error = xfs_zero_eof(ip, newsize, oldsize); if (error) - goto out_unlock; + return error; } /* @@ -802,7 +821,7 @@ xfs_setattr_size( error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) - goto out_unlock; + return error; } /* @@ -810,22 +829,34 @@ xfs_setattr_size( */ inode_dio_wait(inode); + /* + * Do all the page cache truncate work outside the transaction context + * as the "lock" order is page lock->log space reservation. i.e. + * locking pages inside the transaction can ABBA deadlock with + * writeback. We have to do the VFS inode size update before we truncate + * the pagecache, however, to avoid racing with page faults beyond the + * new EOF they are not serialised against truncate operations except by + * page locks and size updates. + * + * Hence we are in a situation where a truncate can fail with ENOMEM + * from xfs_trans_reserve(), but having already truncated the in-memory + * version of the file (i.e. made user visible changes). There's not + * much we can do about this, except to hope that the caller sees ENOMEM + * and retries the truncate operation. + */ error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) - goto out_unlock; + return error; + truncate_setsize(inode, newsize); tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto out_trans_cancel; - truncate_setsize(inode, newsize); - commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); /* @@ -838,10 +869,11 @@ xfs_setattr_size( * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + if (newsize != oldsize && + !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); - mask |= ATTR_CTIME | ATTR_MTIME; + iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } /* @@ -877,22 +909,10 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - /* - * Change file access modes. - */ - if (mask & ATTR_MODE) - xfs_setattr_mode(tp, ip, iattr); - - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } + if (iattr->ia_valid & ATTR_MODE) + xfs_setattr_mode(ip, iattr); + if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -916,12 +936,21 @@ out_trans_cancel: STATIC int xfs_vn_setattr( - struct dentry *dentry, - struct iattr *iattr) + struct dentry *dentry, + struct iattr *iattr) { - if (iattr->ia_valid & ATTR_SIZE) - return -xfs_setattr_size(XFS_I(dentry->d_inode), iattr, 0); - return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0); + struct xfs_inode *ip = XFS_I(dentry->d_inode); + int error; + + if (iattr->ia_valid & ATTR_SIZE) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + error = xfs_setattr_size(ip, iattr); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } else { + error = xfs_setattr_nonsize(ip, iattr, 0); + } + + return -error; } STATIC int @@ -1051,8 +1080,18 @@ xfs_vn_fiemap( return 0; } +STATIC int +xfs_vn_tmpfile( + struct inode *dir, + struct dentry *dentry, + umode_t mode) +{ + return xfs_generic_create(dir, dentry, mode, 0, true); +} + static const struct inode_operations xfs_inode_operations = { .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1080,6 +1119,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1087,6 +1127,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, }; static const struct inode_operations xfs_dir_ci_inode_operations = { @@ -1106,6 +1147,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1113,13 +1155,13 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, }; static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, - .put_link = xfs_vn_put_link, - .get_acl = xfs_get_acl, + .put_link = kfree_put_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1169,6 +1211,7 @@ xfs_setup_inode( struct xfs_inode *ip) { struct inode *inode = &ip->i_vnode; + gfp_t gfp_mask; inode->i_ino = ip->i_ino; inode->i_state = I_NEW; @@ -1204,6 +1247,8 @@ xfs_setup_inode( inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; xfs_diflags_to_iflags(inode, ip); + ip->d_ops = ip->i_mount->m_nondir_inode_ops; + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &xfs_inode_operations; @@ -1211,11 +1256,13 @@ xfs_setup_inode( inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) inode->i_op = &xfs_dir_ci_inode_operations; else inode->i_op = &xfs_dir_inode_operations; inode->i_fop = &xfs_dir_file_operations; + ip->d_ops = ip->i_mount->m_dir_inode_ops; break; case S_IFLNK: inode->i_op = &xfs_symlink_inode_operations; @@ -1229,6 +1276,14 @@ xfs_setup_inode( } /* + * Ensure all page cache allocations are done from GFP_NOFS context to + * prevent direct reclaim recursion back into the filesystem and blowing + * stacks or deadlocking. + */ + gfp_mask = mapping_gfp_mask(inode->i_mapping); + mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); + + /* * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. */ diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index d81fb41205e..1c34e433592 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -30,14 +30,10 @@ extern void xfs_setup_inode(struct xfs_inode *); /* * Internal setattr interfaces. */ -#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ -#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if op would block */ -#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ -#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ -#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */ +#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); -extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags); +extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 084b3e1741f..cb64f222d60 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -17,24 +17,23 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_btree.h" #include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" #include "xfs_itable.h" #include "xfs_error.h" -#include "xfs_btree.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_dinode.h" STATIC int xfs_internal_inum( @@ -210,9 +209,8 @@ xfs_bulkstat( xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ xfs_ino_t lastino; /* last inode number returned */ - int nbcluster; /* # of blocks in a cluster */ - int nicluster; /* # of inodes in a cluster */ - int nimask; /* mask for inode clusters */ + int blks_per_cluster; /* # of blocks per cluster */ + int inodes_per_cluster;/* # of inodes per cluster */ int nirbuf; /* size of irbuf */ int rval; /* return value error code */ int tmp; /* result value from btree calls */ @@ -244,11 +242,8 @@ xfs_bulkstat( *done = 0; fmterror = 0; ubufp = ubuffer; - nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ? - mp->m_sb.sb_inopblock : - (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); - nimask = ~(nicluster - 1); - nbcluster = nicluster >> mp->m_sb.sb_inopblog; + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); if (!irbuf) return ENOMEM; @@ -275,7 +270,8 @@ xfs_bulkstat( /* * Allocate and initialize a btree cursor for ialloc btree. */ - cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); irbp = irbuf; irbufend = irbuf + nirbuf; end_of_ag = 0; @@ -391,12 +387,12 @@ xfs_bulkstat( agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; - chunkidx += nicluster, - agbno += nbcluster) { - if (xfs_inobt_maskn(chunkidx, nicluster) - & ~r.ir_free) + chunkidx += inodes_per_cluster, + agbno += blks_per_cluster) { + if (xfs_inobt_maskn(chunkidx, + inodes_per_cluster) & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster, + agbno, blks_per_cluster, &xfs_inode_buf_ops); } blk_finish_plug(&plug); @@ -626,7 +622,8 @@ xfs_inumbers( agino = 0; continue; } - cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, &tmp); if (error) { diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index f9bb590acc0..825249d2dfc 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -119,6 +119,7 @@ typedef __uint64_t __psunsigned_t; #include "xfs_iops.h" #include "xfs_aops.h" #include "xfs_super.h" +#include "xfs_cksum.h" #include "xfs_buf.h" #include "xfs_message.h" @@ -178,6 +179,7 @@ typedef __uint64_t __psunsigned_t; #define ENOATTR ENODATA /* Attribute not found */ #define EWRONGFS EINVAL /* Mount with wrong filesystem type */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a2dea108071..292308dede6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -17,21 +17,19 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_buf_item.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" #include "xfs_log_recover.h" -#include "xfs_trans_priv.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_trace.h" #include "xfs_fsops.h" @@ -618,11 +616,13 @@ xfs_log_mount( int error = 0; int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - xfs_notice(mp, "Mounting Filesystem"); - else { + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { xfs_notice(mp, -"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } @@ -1000,27 +1000,34 @@ xfs_log_space_wake( } /* - * Determine if we have a transaction that has gone to disk - * that needs to be covered. To begin the transition to the idle state - * firstly the log needs to be idle (no AIL and nothing in the iclogs). - * If we are then in a state where covering is needed, the caller is informed - * that dummy transactions are required to move the log into the idle state. + * Determine if we have a transaction that has gone to disk that needs to be + * covered. To begin the transition to the idle state firstly the log needs to + * be idle. That means the CIL, the AIL and the iclogs needs to be empty before + * we start attempting to cover the log. + * + * Only if we are then in a state where covering is needed, the caller is + * informed that dummy transactions are required to move the log into the idle + * state. * - * Because this is called as part of the sync process, we should also indicate - * that dummy transactions should be issued in anything but the covered or - * idle states. This ensures that the log tail is accurately reflected in - * the log at the end of the sync, hence if a crash occurrs avoids replay - * of transactions where the metadata is already on disk. + * If there are any items in the AIl or CIL, then we do not want to attempt to + * cover the log as we may be in a situation where there isn't log space + * available to run a dummy transaction and this can lead to deadlocks when the + * tail of the log is pinned by an item that is modified in the CIL. Hence + * there's no point in running a dummy transaction at this point because we + * can't start trying to idle the log until both the CIL and AIL are empty. */ int xfs_log_need_covered(xfs_mount_t *mp) { - int needed = 0; struct xlog *log = mp->m_log; + int needed = 0; if (!xfs_fs_writable(mp)) return 0; + if (!xlog_cil_empty(log)) + return 0; + spin_lock(&log->l_icloglock); switch (log->l_covered_state) { case XLOG_STATE_COVER_DONE: @@ -1029,14 +1036,17 @@ xfs_log_need_covered(xfs_mount_t *mp) break; case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: - if (!xfs_ail_min_lsn(log->l_ailp) && - xlog_iclogs_empty(log)) { - if (log->l_covered_state == XLOG_STATE_COVER_NEED) - log->l_covered_state = XLOG_STATE_COVER_DONE; - else - log->l_covered_state = XLOG_STATE_COVER_DONE2; - } - /* FALLTHRU */ + if (xfs_ail_min_lsn(log->l_ailp)) + break; + if (!xlog_iclogs_empty(log)) + break; + + needed = 1; + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else + log->l_covered_state = XLOG_STATE_COVER_DONE2; + break; default: needed = 1; break; @@ -1068,6 +1078,7 @@ xlog_assign_tail_lsn_locked( tail_lsn = lip->li_lsn; else tail_lsn = atomic64_read(&log->l_last_sync_lsn); + trace_xfs_log_assign_tail_lsn(log, tail_lsn); atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; } @@ -1154,7 +1165,7 @@ xlog_iodone(xfs_buf_t *bp) /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { xfs_buf_ioerror_alert(bp, __func__); xfs_buf_stale(bp); @@ -1172,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp) /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ + xfs_buf_unlock(bp); } /* @@ -1359,8 +1373,16 @@ xlog_alloc_log( bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1389,6 +1411,9 @@ xlog_alloc_log( if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1413,7 +1438,6 @@ xlog_alloc_log( iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1622,6 +1646,12 @@ xlog_cksum( * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1629,6 +1659,7 @@ xlog_bdstrat( { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); xfs_buf_stale(bp); @@ -1636,7 +1667,8 @@ xlog_bdstrat( /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } @@ -1838,14 +1870,28 @@ xlog_dealloc_log( xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; i<log->l_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); @@ -1979,7 +2025,7 @@ xlog_print_tic_res( for (i = 0; i < ticket->t_res_num; i++) { uint r_type = ticket->t_res_arr[i].r_type; - xfs_warn(mp, "region[%u]: %s - %u bytes\n", i, + xfs_warn(mp, "region[%u]: %s - %u bytes", i, ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? "bad-rtype" : res_type_str[r_type-1]), ticket->t_res_arr[i].r_len); @@ -3702,11 +3748,9 @@ xlog_verify_iclog( /* check validity of iclog pointers */ spin_lock(&log->l_icloglock); icptr = log->l_iclog; - for (i=0; i < log->l_iclog_bufs; i++) { - if (icptr == NULL) - xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); - icptr = icptr->ic_next; - } + for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next) + ASSERT(icptr); + if (icptr != log->l_iclog) xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); spin_unlock(&log->l_icloglock); @@ -3908,11 +3952,14 @@ xfs_log_force_umount( retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); } + /* - * Wake up everybody waiting on xfs_log_force. - * Callback all log item committed functions as if the - * log writes were completed. + * Wake up everybody waiting on xfs_log_force. Wake the CIL push first + * as if the log writes were completed. The abort handling in the log + * item committed callback functions will do this again under lock to + * avoid races. */ + wake_up_all(&log->l_cilp->xc_commit_wait); xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); #ifdef XFSERRORDEBUG diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 1c458487f00..84e0deb95ab 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -18,20 +18,71 @@ #ifndef __XFS_LOG_H__ #define __XFS_LOG_H__ -#include "xfs_log_format.h" - struct xfs_log_vec { struct xfs_log_vec *lv_next; /* next lv in build list */ int lv_niovecs; /* number of iovecs in lv */ struct xfs_log_iovec *lv_iovecp; /* iovec array */ struct xfs_log_item *lv_item; /* owner */ char *lv_buf; /* formatted buffer */ - int lv_buf_len; /* size of formatted buffer */ + int lv_bytes; /* accounted space in buffer */ + int lv_buf_len; /* aligned size of buffer */ int lv_size; /* size of allocated lv */ }; #define XFS_LOG_VEC_ORDERED (-1) +static inline void * +xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t))); + + *vecp = vec; + return vec->i_addr; +} + +/* + * We need to make sure the next buffer is naturally aligned for the biggest + * basic data type we put into it. We already accounted for this padding when + * sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + */ +static inline void +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) +{ + lv->lv_buf_len += round_up(len, sizeof(uint64_t)); + lv->lv_bytes += len; + vec->i_len = len; +} + +static inline void * +xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type, void *data, int len) +{ + void *buf; + + buf = xlog_prepare_iovec(lv, vecp, type); + memcpy(buf, data, len); + xlog_finish_iovec(lv, *vecp, len); + return buf; +} + /* * Structure used to pass callback function and the function's argument * to the log manager. @@ -82,11 +133,7 @@ struct xlog_ticket; struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; - -void xfs_log_item_init(struct xfs_mount *mp, - struct xfs_log_item *item, - int type, - const struct xfs_item_ops *ops); +struct xfs_log_callback; xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, @@ -114,7 +161,7 @@ xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); int xfs_log_notify(struct xfs_mount *mp, struct xlog_in_core *iclog, - xfs_log_callback_t *callback_entry); + struct xfs_log_callback *callback_entry); int xfs_log_release_iclog(struct xfs_mount *mp, struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, @@ -135,7 +182,7 @@ void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); -int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, +void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index cfe97973ba3..b3425b34e3d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -17,11 +17,9 @@ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" -#include "xfs_log_priv.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -29,6 +27,10 @@ #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_discard.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" /* * Allocate a new ticket. Failing to get a new ticket makes it really hard to @@ -80,36 +82,6 @@ xlog_cil_init_post_recovery( log->l_curr_block); } -STATIC int -xlog_cil_lv_item_format( - struct xfs_log_item *lip, - struct xfs_log_vec *lv) -{ - int index; - char *ptr; - - /* format new vectors into array */ - lip->li_ops->iop_format(lip, lv->lv_iovecp); - - /* copy data into existing array */ - ptr = lv->lv_buf; - for (index = 0; index < lv->lv_niovecs; index++) { - struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; - - memcpy(ptr, vec->i_addr, vec->i_len); - vec->i_addr = ptr; - ptr += vec->i_len; - } - - /* - * some size calculations for log vectors over-estimate, so the caller - * doesn't know the amount of space actually used by the item. Return - * the byte count to the caller so they can check and store it - * appropriately. - */ - return ptr - lv->lv_buf; -} - /* * Prepare the log item for insertion into the CIL. Calculate the difference in * log space and vectors it will consume, and if it is a new item pin it as @@ -125,7 +97,7 @@ xfs_cil_prepare_item( { /* Account for the new LV being passed in */ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { - *diff_len += lv->lv_buf_len; + *diff_len += lv->lv_bytes; *diff_iovecs += lv->lv_niovecs; } @@ -139,7 +111,7 @@ xfs_cil_prepare_item( else if (old_lv != lv) { ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); - *diff_len -= old_lv->lv_buf_len; + *diff_len -= old_lv->lv_bytes; *diff_iovecs -= old_lv->lv_niovecs; kmem_free(old_lv); } @@ -230,12 +202,28 @@ xlog_cil_insert_format_items( nbytes = 0; } + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. + */ + nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); + /* grab the old item if it exists for reservation accounting */ old_lv = lip->li_lv; - /* calc buffer size */ - buf_size = sizeof(struct xfs_log_vec) + nbytes + - niovecs * sizeof(struct xfs_log_iovec); + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. + */ + buf_size = nbytes + + round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); /* compare to existing item size */ if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { @@ -251,35 +239,31 @@ xlog_cil_insert_format_items( * that the space reservation accounting is correct. */ *diff_iovecs -= lv->lv_niovecs; - *diff_len -= lv->lv_buf_len; - - /* Ensure the lv is set up according to ->iop_size */ - lv->lv_niovecs = niovecs; - lv->lv_buf = (char *)lv + buf_size - nbytes; - - lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); - goto insert; + *diff_len -= lv->lv_bytes; + } else { + /* allocate new data chunk */ + lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) { + /* track as an ordered logvec */ + ASSERT(lip->li_lv == NULL); + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + goto insert; + } + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; } - /* allocate new data chunk */ - lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); - lv->lv_item = lip; - lv->lv_size = buf_size; + /* Ensure the lv is set up according to ->iop_size */ lv->lv_niovecs = niovecs; - if (ordered) { - /* track as an ordered logvec */ - ASSERT(lip->li_lv == NULL); - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; - goto insert; - } - - /* The allocated iovec region lies beyond the log vector. */ - lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; /* The allocated data region lies beyond the iovec region */ + lv->lv_buf_len = 0; + lv->lv_bytes = 0; lv->lv_buf = (char *)lv + buf_size - nbytes; + ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); - lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); + lip->li_ops->iop_format(lip, lv); insert: ASSERT(lv->lv_buf_len <= nbytes); xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); @@ -402,7 +386,15 @@ xlog_cil_committed( xfs_extent_busy_clear(mp, &ctx->busy_extents, (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); + /* + * If we are aborting the commit, wake up anyone waiting on the + * committing list. If we don't, then a shutdown we can leave processes + * waiting in xlog_cil_force_lsn() waiting on a sequence commit that + * will never happen because we aborted it. + */ spin_lock(&ctx->cil->xc_push_lock); + if (abort) + wake_up_all(&ctx->cil->xc_commit_wait); list_del(&ctx->committing); spin_unlock(&ctx->cil->xc_push_lock); @@ -516,13 +508,6 @@ xlog_cil_push( cil->xc_ctx = new_ctx; /* - * mirror the new sequence into the cil structure so that we can do - * unlocked checks against the current sequence in log forces without - * risking deferencing a freed context pointer. - */ - cil->xc_current_sequence = new_ctx->sequence; - - /* * The switch is now done, so we can drop the context lock and move out * of a shared context. We can't just go straight to the commit record, * though - we need to synchronise with previous and future commits so @@ -540,8 +525,15 @@ xlog_cil_push( * Hence we need to add this context to the committing context list so * that higher sequences will wait for us to write out a commit record * before they do. + * + * xfs_log_force_lsn requires us to mirror the new sequence into the cil + * structure atomically with the addition of this sequence to the + * committing list. This also ensures that we can do unlocked checks + * against the current sequence in log forces without risking + * deferencing a freed context pointer. */ spin_lock(&cil->xc_push_lock); + cil->xc_current_sequence = new_ctx->sequence; list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock); @@ -581,8 +573,18 @@ restart: spin_lock(&cil->xc_push_lock); list_for_each_entry(new_ctx, &cil->xc_committing, committing) { /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) { + spin_unlock(&cil->xc_push_lock); + goto out_abort_free_ticket; + } + + /* * Higher sequences will wait for this one so skip them. - * Don't wait for own own sequence, either. + * Don't wait for our own sequence, either. */ if (new_ctx->sequence >= ctx->sequence) continue; @@ -679,8 +681,14 @@ xlog_cil_push_background( } +/* + * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence + * number that is passed. When it returns, the work will be queued for + * @push_seq, but it won't be completed. The caller is expected to do any + * waiting for push_seq to complete if it is required. + */ static void -xlog_cil_push_foreground( +xlog_cil_push_now( struct xlog *log, xfs_lsn_t push_seq) { @@ -705,10 +713,22 @@ xlog_cil_push_foreground( } cil->xc_push_seq = push_seq; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); spin_unlock(&cil->xc_push_lock); +} - /* do the push now */ - xlog_cil_push(log); +bool +xlog_cil_empty( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + bool empty = false; + + spin_lock(&cil->xc_push_lock); + if (list_empty(&cil->xc_cil)) + empty = true; + spin_unlock(&cil->xc_push_lock); + return empty; } /* @@ -724,7 +744,7 @@ xlog_cil_push_foreground( * background commit, returns without it held once background commits are * allowed again. */ -int +void xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, @@ -770,7 +790,6 @@ xfs_log_commit_cil( xlog_cil_push_background(log); up_read(&cil->xc_ctx_lock); - return 0; } /* @@ -799,7 +818,8 @@ xlog_cil_force_lsn( * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ - xlog_cil_push_foreground(log, sequence); +restart: + xlog_cil_push_now(log, sequence); /* * See if we can find a previous sequence still committing. @@ -807,9 +827,15 @@ xlog_cil_force_lsn( * before allowing the force of push_seq to go ahead. Hence block * on commits for those as well. */ -restart: spin_lock(&cil->xc_push_lock); list_for_each_entry(ctx, &cil->xc_committing, committing) { + /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) + goto out_shutdown; if (ctx->sequence > sequence) continue; if (!ctx->commit_lsn) { @@ -825,8 +851,39 @@ restart: /* found it! */ commit_lsn = ctx->commit_lsn; } + + /* + * The call to xlog_cil_push_now() executes the push in the background. + * Hence by the time we have got here it our sequence may not have been + * pushed yet. This is true if the current sequence still matches the + * push sequence after the above wait loop and the CIL still contains + * dirty objects. + * + * When the push occurs, it will empty the CIL and atomically increment + * the currect sequence past the push sequence and move it into the + * committing list. Of course, if the CIL is clean at the time of the + * push, it won't have pushed the CIL at all, so in that case we should + * try the push for this sequence again from the start just in case. + */ + if (sequence == cil->xc_current_sequence && + !list_empty(&cil->xc_cil)) { + spin_unlock(&cil->xc_push_lock); + goto restart; + } + spin_unlock(&cil->xc_push_lock); return commit_lsn; + + /* + * We detected a shutdown in progress. We need to trigger the log force + * to pass through it's iclog state machine error handling, even though + * we are already in a shutdown state. Hence we can't return + * NULLCOMMITLSN here as that has special meaning to log forces (i.e. + * LSN is already stable), so we return a zero LSN instead. + */ +out_shutdown: + spin_unlock(&cil->xc_push_lock); + return 0; } /* diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h index ca7e28a8ed3..f0969c77bdb 100644 --- a/fs/xfs/xfs_log_format.h +++ b/fs/xfs/xfs_log_format.h @@ -234,178 +234,6 @@ typedef struct xfs_trans_header { { XFS_LI_ICREATE, "XFS_LI_ICREATE" } /* - * Transaction types. Used to distinguish types of buffers. - */ -#define XFS_TRANS_SETATTR_NOT_SIZE 1 -#define XFS_TRANS_SETATTR_SIZE 2 -#define XFS_TRANS_INACTIVE 3 -#define XFS_TRANS_CREATE 4 -#define XFS_TRANS_CREATE_TRUNC 5 -#define XFS_TRANS_TRUNCATE_FILE 6 -#define XFS_TRANS_REMOVE 7 -#define XFS_TRANS_LINK 8 -#define XFS_TRANS_RENAME 9 -#define XFS_TRANS_MKDIR 10 -#define XFS_TRANS_RMDIR 11 -#define XFS_TRANS_SYMLINK 12 -#define XFS_TRANS_SET_DMATTRS 13 -#define XFS_TRANS_GROWFS 14 -#define XFS_TRANS_STRAT_WRITE 15 -#define XFS_TRANS_DIOSTRAT 16 -/* 17 was XFS_TRANS_WRITE_SYNC */ -#define XFS_TRANS_WRITEID 18 -#define XFS_TRANS_ADDAFORK 19 -#define XFS_TRANS_ATTRINVAL 20 -#define XFS_TRANS_ATRUNCATE 21 -#define XFS_TRANS_ATTR_SET 22 -#define XFS_TRANS_ATTR_RM 23 -#define XFS_TRANS_ATTR_FLAG 24 -#define XFS_TRANS_CLEAR_AGI_BUCKET 25 -#define XFS_TRANS_QM_SBCHANGE 26 -/* - * Dummy entries since we use the transaction type to index into the - * trans_type[] in xlog_recover_print_trans_head() - */ -#define XFS_TRANS_DUMMY1 27 -#define XFS_TRANS_DUMMY2 28 -#define XFS_TRANS_QM_QUOTAOFF 29 -#define XFS_TRANS_QM_DQALLOC 30 -#define XFS_TRANS_QM_SETQLIM 31 -#define XFS_TRANS_QM_DQCLUSTER 32 -#define XFS_TRANS_QM_QINOCREATE 33 -#define XFS_TRANS_QM_QUOTAOFF_END 34 -#define XFS_TRANS_SB_UNIT 35 -#define XFS_TRANS_FSYNC_TS 36 -#define XFS_TRANS_GROWFSRT_ALLOC 37 -#define XFS_TRANS_GROWFSRT_ZERO 38 -#define XFS_TRANS_GROWFSRT_FREE 39 -#define XFS_TRANS_SWAPEXT 40 -#define XFS_TRANS_SB_COUNT 41 -#define XFS_TRANS_CHECKPOINT 42 -#define XFS_TRANS_ICREATE 43 -#define XFS_TRANS_TYPE_MAX 43 -/* new transaction types need to be reflected in xfs_logprint(8) */ - -#define XFS_TRANS_TYPES \ - { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ - { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ - { XFS_TRANS_INACTIVE, "INACTIVE" }, \ - { XFS_TRANS_CREATE, "CREATE" }, \ - { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ - { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ - { XFS_TRANS_REMOVE, "REMOVE" }, \ - { XFS_TRANS_LINK, "LINK" }, \ - { XFS_TRANS_RENAME, "RENAME" }, \ - { XFS_TRANS_MKDIR, "MKDIR" }, \ - { XFS_TRANS_RMDIR, "RMDIR" }, \ - { XFS_TRANS_SYMLINK, "SYMLINK" }, \ - { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ - { XFS_TRANS_GROWFS, "GROWFS" }, \ - { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ - { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ - { XFS_TRANS_WRITEID, "WRITEID" }, \ - { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ - { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ - { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ - { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ - { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ - { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ - { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ - { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ - { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ - { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ - { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ - { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ - { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ - { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ - { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \ - { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ - { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ - { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ - { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ - { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ - { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ - { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ - { XFS_TRANS_DUMMY1, "DUMMY1" }, \ - { XFS_TRANS_DUMMY2, "DUMMY2" }, \ - { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } - -/* - * This structure is used to track log items associated with - * a transaction. It points to the log item and keeps some - * flags to track the state of the log item. It also tracks - * the amount of space needed to log the item it describes - * once we get to commit processing (see xfs_trans_commit()). - */ -struct xfs_log_item_desc { - struct xfs_log_item *lid_item; - struct list_head lid_trans; - unsigned char lid_flags; -}; - -#define XFS_LID_DIRTY 0x1 - -/* - * Values for t_flags. - */ -#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ -#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ -#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ -#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ -#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ -#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer - count in superblock */ - -/* - * Values for call flags parameter. - */ -#define XFS_TRANS_RELEASE_LOG_RES 0x4 -#define XFS_TRANS_ABORT 0x8 - -/* - * Field values for xfs_trans_mod_sb. - */ -#define XFS_TRANS_SB_ICOUNT 0x00000001 -#define XFS_TRANS_SB_IFREE 0x00000002 -#define XFS_TRANS_SB_FDBLOCKS 0x00000004 -#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 -#define XFS_TRANS_SB_FREXTENTS 0x00000010 -#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 -#define XFS_TRANS_SB_DBLOCKS 0x00000040 -#define XFS_TRANS_SB_AGCOUNT 0x00000080 -#define XFS_TRANS_SB_IMAXPCT 0x00000100 -#define XFS_TRANS_SB_REXTSIZE 0x00000200 -#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 -#define XFS_TRANS_SB_RBLOCKS 0x00000800 -#define XFS_TRANS_SB_REXTENTS 0x00001000 -#define XFS_TRANS_SB_REXTSLOG 0x00002000 - -/* - * Here we centralize the specification of XFS meta-data buffer - * reference count values. This determine how hard the buffer - * cache tries to hold onto the buffer. - */ -#define XFS_AGF_REF 4 -#define XFS_AGI_REF 4 -#define XFS_AGFL_REF 3 -#define XFS_INO_BTREE_REF 3 -#define XFS_ALLOC_BTREE_REF 2 -#define XFS_BMAP_BTREE_REF 2 -#define XFS_DIR_BTREE_REF 2 -#define XFS_INO_REF 2 -#define XFS_ATTR_BTREE_REF 1 -#define XFS_DQUOT_REF 1 - -/* - * Flags for xfs_trans_ichgtime(). - */ -#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ -#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ -#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ - - -/* * Inode Log Item Format definitions. * * This is the structure used to lay out an inode log item in the @@ -797,7 +625,6 @@ typedef struct xfs_qoff_logformat { char qf_pad[12]; /* padding for future */ } xfs_qoff_logformat_t; - /* * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. */ @@ -849,8 +676,4 @@ struct xfs_icreate_log { __be32 icl_gen; /* inode generation number to use */ }; -int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); -int xfs_log_calc_minimum_size(struct xfs_mount *); - - #endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 136654b9400..9bc403a9e54 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -22,6 +22,7 @@ struct xfs_buf; struct xlog; struct xlog_ticket; struct xfs_mount; +struct xfs_log_callback; /* * Flags for log structure @@ -227,8 +228,8 @@ typedef struct xlog_in_core { /* Callback structures need their own cacheline */ spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; - xfs_log_callback_t *ic_callback; - xfs_log_callback_t **ic_callback_tail; + struct xfs_log_callback *ic_callback; + struct xfs_log_callback **ic_callback_tail; /* reference counts need their own cacheline */ atomic_t ic_refcnt ____cacheline_aligned_in_smp; @@ -254,7 +255,7 @@ struct xfs_cil_ctx { int space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ struct xfs_log_vec *lv_chain; /* logvecs being pushed */ - xfs_log_callback_t log_cb; /* completion callback hook. */ + struct xfs_log_callback log_cb; /* completion callback hook. */ struct list_head committing; /* ctx committing list */ }; @@ -514,12 +515,10 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space) /* * Committed Item List interfaces */ -int -xlog_cil_init(struct xlog *log); -void -xlog_cil_init_post_recovery(struct xlog *log); -void -xlog_cil_destroy(struct xlog *log); +int xlog_cil_init(struct xlog *log); +void xlog_cil_init_post_recovery(struct xlog *log); +void xlog_cil_destroy(struct xlog *log); +bool xlog_cil_empty(struct xlog *log); /* * CIL force routines diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 39797490a1f..981af0f6504 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -17,42 +17,34 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_error.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" +#include "xfs_trans.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_buf_item.h" #include "xfs_log_recover.h" +#include "xfs_inode_item.h" #include "xfs_extfree_item.h" #include "xfs_trans_priv.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" #include "xfs_quota.h" #include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_icreate_item.h" - -/* Need all the magic numbers and buffer ops structures from these headers */ -#include "xfs_symlink.h" -#include "xfs_da_btree.h" -#include "xfs_dir2_format.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_error.h" #include "xfs_dir2.h" -#include "xfs_attr_leaf.h" -#include "xfs_attr_remote.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -201,7 +193,10 @@ xlog_bread_noalign( bp->b_io_length = nbblks; bp->b_error = 0; - xfsbdstrat(log->l_mp, bp); + if (XFS_FORCED_SHUTDOWN(log->l_mp)) + return XFS_ERROR(EIO); + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) xfs_buf_ioerror_alert(bp, __func__); @@ -305,9 +300,9 @@ xlog_header_check_dump( xfs_mount_t *mp, xlog_rec_header_t *head) { - xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n", + xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", __func__, &mp->m_sb.sb_uuid, XLOG_FMT); - xfs_debug(mp, " log : uuid = %pU, fmt = %d\n", + xfs_debug(mp, " log : uuid = %pU, fmt = %d", &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); } #else @@ -1659,6 +1654,7 @@ xlog_recover_reorder_trans( int pass) { xlog_recover_item_t *item, *n; + int error = 0; LIST_HEAD(sort_list); LIST_HEAD(cancel_list); LIST_HEAD(buffer_list); @@ -1700,9 +1696,17 @@ xlog_recover_reorder_trans( "%s: unrecognized type of log operation", __func__); ASSERT(0); - return XFS_ERROR(EIO); + /* + * return the remaining items back to the transaction + * item list so they can be freed in caller. + */ + if (!list_empty(&sort_list)) + list_splice_init(&sort_list, &trans->r_itemq); + error = XFS_ERROR(EIO); + goto out; } } +out: ASSERT(list_empty(&sort_list)); if (!list_empty(&buffer_list)) list_splice(&buffer_list, &trans->r_itemq); @@ -1712,7 +1716,7 @@ xlog_recover_reorder_trans( list_splice_tail(&inode_buffer_list, &trans->r_itemq); if (!list_empty(&cancel_list)) list_splice_tail(&cancel_list, &trans->r_itemq); - return 0; + return error; } /* @@ -2134,7 +2138,9 @@ xlog_recover_validate_buf_type( bp->b_ops = &xfs_allocbt_buf_ops; break; case XFS_IBT_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: case XFS_IBT_MAGIC: + case XFS_FIBT_MAGIC: bp->b_ops = &xfs_inobt_buf_ops; break; case XFS_BMAP_CRC_MAGIC: @@ -2362,7 +2368,7 @@ xlog_recover_do_reg_buffer( item->ri_buf[i].i_len, __func__); goto next; } - error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr, + error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, -1, 0, XFS_QMOPT_DOWARN, "dquot_buf_recover"); if (error) @@ -2394,133 +2400,6 @@ xlog_recover_do_reg_buffer( } /* - * Do some primitive error checking on ondisk dquot data structures. - */ -int -xfs_qm_dqcheck( - struct xfs_mount *mp, - xfs_disk_dquot_t *ddq, - xfs_dqid_t id, - uint type, /* used only when IO_dorepair is true */ - uint flags, - char *str) -{ - xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; - int errs = 0; - - /* - * We can encounter an uninitialized dquot buffer for 2 reasons: - * 1. If we crash while deleting the quotainode(s), and those blks got - * used for user data. This is because we take the path of regular - * file deletion; however, the size field of quotainodes is never - * updated, so all the tricks that we play in itruncate_finish - * don't quite matter. - * - * 2. We don't play the quota buffers when there's a quotaoff logitem. - * But the allocation will be replayed so we'll end up with an - * uninitialized quota block. - * - * This is all fine; things are still consistent, and we haven't lost - * any quota information. Just don't complain about bad dquot blks. - */ - if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", - str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); - errs++; - } - if (ddq->d_version != XFS_DQUOT_VERSION) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", - str, id, ddq->d_version, XFS_DQUOT_VERSION); - errs++; - } - - if (ddq->d_flags != XFS_DQ_USER && - ddq->d_flags != XFS_DQ_PROJ && - ddq->d_flags != XFS_DQ_GROUP) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, unknown flags 0x%x", - str, id, ddq->d_flags); - errs++; - } - - if (id != -1 && id != be32_to_cpu(ddq->d_id)) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : ondisk-dquot 0x%p, ID mismatch: " - "0x%x expected, found id 0x%x", - str, ddq, id, be32_to_cpu(ddq->d_id)); - errs++; - } - - if (!errs && ddq->d_id) { - if (ddq->d_blk_softlimit && - be64_to_cpu(ddq->d_bcount) > - be64_to_cpu(ddq->d_blk_softlimit)) { - if (!ddq->d_btimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_ino_softlimit && - be64_to_cpu(ddq->d_icount) > - be64_to_cpu(ddq->d_ino_softlimit)) { - if (!ddq->d_itimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_rtb_softlimit && - be64_to_cpu(ddq->d_rtbcount) > - be64_to_cpu(ddq->d_rtb_softlimit)) { - if (!ddq->d_rtbtimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - } - - if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) - return errs; - - if (flags & XFS_QMOPT_DOWARN) - xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); - - /* - * Typically, a repair is only requested by quotacheck. - */ - ASSERT(id != -1); - ASSERT(flags & XFS_QMOPT_DQREPAIR); - memset(d, 0, sizeof(xfs_dqblk_t)); - - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_flags = type; - d->dd_diskdq.d_id = cpu_to_be32(id); - - if (xfs_sb_version_hascrc(&mp->m_sb)) { - uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); - xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), - XFS_DQUOT_CRC_OFF); - } - - return errs; -} - -/* * Perform a dquot buffer recovery. * Simple algorithm: if we have found a QUOTAOFF log item of the same type * (ie. USR or GRP), then just toss this buffer away; don't recover it. @@ -2649,19 +2528,19 @@ xlog_recover_buffer_pass2( * * Also make sure that only inode buffers with good sizes stay in * the buffer cache. The kernel moves inodes in buffers of 1 block - * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) - * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * for *our* value of mp->m_inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, - (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { + (__uint32_t)log->l_mp->m_inode_cluster_size))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { @@ -3125,7 +3004,7 @@ xlog_recover_dquot_pass2( */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, "xlog_recover_dquot_pass2 (log copy)"); if (error) return XFS_ERROR(EIO); @@ -3145,7 +3024,7 @@ xlog_recover_dquot_pass2( * was among a chunk of dquots created earlier, and we did some * minimal initialization then. */ - error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, "xlog_recover_dquot_pass2"); if (error) { xfs_buf_relse(bp); @@ -3268,7 +3147,7 @@ xlog_recover_efd_pass2( } lip = xfs_trans_ail_cursor_next(ailp, &cur); } - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); return 0; @@ -3334,10 +3213,10 @@ xlog_recover_do_icreate_pass2( } /* existing allocation is fixed value */ - ASSERT(count == XFS_IALLOC_INODES(mp)); - ASSERT(length == XFS_IALLOC_BLOCKS(mp)); - if (count != XFS_IALLOC_INODES(mp) || - length != XFS_IALLOC_BLOCKS(mp)) { + ASSERT(count == mp->m_ialloc_inos); + ASSERT(length == mp->m_ialloc_blks); + if (count != mp->m_ialloc_inos || + length != mp->m_ialloc_blks) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); return EINVAL; } @@ -3643,8 +3522,7 @@ out: STATIC int xlog_recover_unmount_trans( - struct xlog *log, - struct xlog_recover *trans) + struct xlog *log) { /* Do nothing now */ xfs_warn(log->l_mp, "%s: Unmount LR", __func__); @@ -3718,7 +3596,7 @@ xlog_recover_process_data( trans, pass); break; case XLOG_UNMOUNT_TRANS: - error = xlog_recover_unmount_trans(log, trans); + error = xlog_recover_unmount_trans(log); break; case XLOG_WAS_CONT_TRANS: error = xlog_recover_add_to_cont_trans(log, @@ -3743,8 +3621,10 @@ xlog_recover_process_data( error = XFS_ERROR(EIO); break; } - if (error) + if (error) { + xlog_recover_free_trans(trans); return error; + } } dp += be32_to_cpu(ohead->oh_len); num_logops--; @@ -3878,7 +3758,7 @@ xlog_recover_process_efis( lip = xfs_trans_ail_cursor_next(ailp, &cur); } out: - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); return error; } @@ -4077,7 +3957,7 @@ xlog_unpack_data_crc( if (crc != rhead->h_crc) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, - "log record CRC mismatch: found 0x%x, expected 0x%x.\n", + "log record CRC mismatch: found 0x%x, expected 0x%x.", le32_to_cpu(rhead->h_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); @@ -4532,7 +4412,13 @@ xlog_do_recover( XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); bp->b_ops = &xfs_sb_buf_ops; - xfsbdstrat(log->l_mp, bp); + + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_buf_relse(bp); + return XFS_ERROR(EIO); + } + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, __func__); diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c index bbcec0bbc12..ee7e0e80246 100644 --- a/fs/xfs/xfs_log_rlimit.c +++ b/fs/xfs/xfs_log_rlimit.c @@ -17,16 +17,19 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_ag.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_trans_space.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_da_btree.h" #include "xfs_attr_leaf.h" +#include "xfs_bmap_btree.h" /* * Calculate the maximum length in bytes that would be required for a local @@ -39,7 +42,7 @@ xfs_log_calc_max_attrsetm_res( int size; int nblks; - size = xfs_attr_leaf_entsize_local_max(mp->m_sb.sb_blocksize) - + size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) - MAXNAMELEN - 1; nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); nblks += XFS_B_TO_FSB(mp, size); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 9163dc14053..63ca2f0420b 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -17,9 +17,8 @@ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5dcc68019d1..3507cd0ec40 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -17,35 +17,31 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_dir2_format.h" -#include "xfs_dir2.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" -#include "xfs_btree.h" +#include "xfs_dir2.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_fsops.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_cksum.h" -#include "xfs_buf_item.h" +#include "xfs_dinode.h" #ifdef HAVE_PERCPU_SB @@ -286,22 +282,29 @@ xfs_readsb( struct xfs_sb *sbp = &mp->m_sb; int error; int loud = !(flags & XFS_MFSI_QUIET); + const struct xfs_buf_ops *buf_ops; ASSERT(mp->m_sb_bp == NULL); ASSERT(mp->m_ddev_targp != NULL); /* + * For the initial read, we must guess at the sector + * size based on the block device. It's enough to + * get the sb_sectsize out of the superblock and + * then reread with the proper length. + * We don't verify it yet, because it may not be complete. + */ + sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); + buf_ops = NULL; + + /* * Allocate a (locked) buffer to hold the superblock. * This will be kept around at all times to optimize * access to the superblock. */ - sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); - reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, - loud ? &xfs_sb_buf_ops - : &xfs_sb_quiet_buf_ops); + BTOBB(sector_size), 0, buf_ops); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -311,14 +314,28 @@ reread: error = bp->b_error; if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; goto release_buf; } /* * Initialize the mount structure from the superblock. */ - xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); - xfs_sb_quota_from_disk(&mp->m_sb); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_quota_from_disk(sbp); + + /* + * If we haven't validated the superblock, do so now before we try + * to check the sector size and reread the superblock appropriately. + */ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (loud) + xfs_warn(mp, "Invalid superblock magic number"); + error = EINVAL; + goto release_buf; + } /* * We must be able to do sector-sized and sector-aligned IO. @@ -331,13 +348,14 @@ reread: goto release_buf; } - /* - * If device sector size is smaller than the superblock size, - * re-read the superblock so the buffer is correctly sized. - */ - if (sector_size < sbp->sb_sectsize) { + if (buf_ops == NULL) { + /* + * Re-read the superblock so the buffer is correctly sized, + * and properly verified. + */ xfs_buf_relse(bp); sector_size = sbp->sb_sectsize; + buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops; goto reread; } @@ -690,6 +708,12 @@ xfs_mountfs( mp->m_update_flags |= XFS_SB_VERSIONNUM; } + /* always use v2 inodes by default now */ + if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { + mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; + mp->m_update_flags |= XFS_SB_VERSIONNUM; + } + /* * Check if sb_agblocks is aligned at stripe boundary * If sb_agblocks is NOT aligned turn off m_dalign since @@ -723,8 +747,20 @@ xfs_mountfs( * Set the inode cluster size. * This may still be overridden by the file system * block size if it is larger than the chosen cluster size. + * + * For v5 filesystems, scale the cluster size with the inode size to + * keep a constant ratio of inode per cluster buffer, but only if mkfs + * has set the inode alignment value appropriately for larger cluster + * sizes. */ mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int new_size = mp->m_inode_cluster_size; + + new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; + if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) + mp->m_inode_cluster_size = new_size; + } /* * Set inode alignment fields @@ -755,12 +791,11 @@ xfs_mountfs( mp->m_dmevmask = 0; /* not persistent; set after each mount */ - xfs_dir_mount(mp); - - /* - * Initialize the attribute manager's entries. - */ - mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100; + error = xfs_da_mount(mp); + if (error) { + xfs_warn(mp, "Failed dir/attr init: %d", error); + goto out_remove_uuid; + } /* * Initialize the precomputed transaction reservations values. @@ -775,7 +810,7 @@ xfs_mountfs( error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); - goto out_remove_uuid; + goto out_free_dir; } if (!sbp->sb_logblocks) { @@ -950,6 +985,8 @@ xfs_mountfs( xfs_wait_buftarg(mp->m_ddev_targp); out_free_perag: xfs_free_perag(mp); + out_free_dir: + xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); out: @@ -1027,6 +1064,7 @@ xfs_unmountfs( "Freespace may not be correct on next mount."); xfs_log_unmount(mp); + xfs_da_unmount(mp); xfs_uuid_unmount(mp); #if defined(DEBUG) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1fa0584b562..7295a0b7c34 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -26,6 +26,8 @@ struct xfs_mru_cache; struct xfs_nameops; struct xfs_ail; struct xfs_quotainfo; +struct xfs_dir_ops; +struct xfs_da_geometry; #ifdef HAVE_PERCPU_SB @@ -95,6 +97,8 @@ typedef struct xfs_mount { uint m_readio_blocks; /* min read size blocks */ uint m_writeio_log; /* min write size log bytes */ uint m_writeio_blocks; /* min write size blocks */ + struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ + struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ @@ -111,7 +115,7 @@ typedef struct xfs_mount { __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ __uint8_t m_agno_log; /* log #ag's */ __uint8_t m_agino_log; /* #bits for agino in inum */ - __uint16_t m_inode_cluster_size;/* min inode buf size */ + uint m_inode_cluster_size;/* min inode buf size */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ @@ -130,8 +134,6 @@ typedef struct xfs_mount { int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ __uint64_t m_flags; /* global mount flags */ - uint m_dir_node_ents; /* #entries in a dir danode */ - uint m_attr_node_ents; /* #entries in attr danode */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_inoalign_mask;/* mask sb_inoalignmt if used */ @@ -144,15 +146,10 @@ typedef struct xfs_mount { int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ int m_sinoalign; /* stripe unit inode alignment */ - int m_attr_magicpct;/* 37% of the blocksize */ - int m_dir_magicpct; /* 37% of the dir blocksize */ __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ - int m_dirblksize; /* directory block sz--bytes */ - int m_dirblkfsbs; /* directory block sz--fsbs */ - xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */ - xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ - xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ + const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */ + const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ uint m_chsize; /* size of next field */ atomic_t m_active_trans; /* number trans frozen */ #ifdef HAVE_PERCPU_SB diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 4aff5639573..f99b4933dc2 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -100,14 +100,20 @@ * likely result in a loop in one of the lists. That's a sure-fire recipe for * an infinite loop in the code. */ -typedef struct xfs_mru_cache_elem -{ - struct list_head list_node; - unsigned long key; - void *value; -} xfs_mru_cache_elem_t; +struct xfs_mru_cache { + struct radix_tree_root store; /* Core storage data structure. */ + struct list_head *lists; /* Array of lists, one per grp. */ + struct list_head reap_list; /* Elements overdue for reaping. */ + spinlock_t lock; /* Lock to protect this struct. */ + unsigned int grp_count; /* Number of discrete groups. */ + unsigned int grp_time; /* Time period spanned by grps. */ + unsigned int lru_grp; /* Group containing time zero. */ + unsigned long time_zero; /* Time first element was added. */ + xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ + struct delayed_work work; /* Workqueue data for reaping. */ + unsigned int queued; /* work has been queued */ +}; -static kmem_zone_t *xfs_mru_elem_zone; static struct workqueue_struct *xfs_mru_reap_wq; /* @@ -129,12 +135,12 @@ static struct workqueue_struct *xfs_mru_reap_wq; */ STATIC unsigned long _xfs_mru_cache_migrate( - xfs_mru_cache_t *mru, - unsigned long now) + struct xfs_mru_cache *mru, + unsigned long now) { - unsigned int grp; - unsigned int migrated = 0; - struct list_head *lru_list; + unsigned int grp; + unsigned int migrated = 0; + struct list_head *lru_list; /* Nothing to do if the data store is empty. */ if (!mru->time_zero) @@ -193,11 +199,11 @@ _xfs_mru_cache_migrate( */ STATIC void _xfs_mru_cache_list_insert( - xfs_mru_cache_t *mru, - xfs_mru_cache_elem_t *elem) + struct xfs_mru_cache *mru, + struct xfs_mru_cache_elem *elem) { - unsigned int grp = 0; - unsigned long now = jiffies; + unsigned int grp = 0; + unsigned long now = jiffies; /* * If the data store is empty, initialise time zero, leave grp set to @@ -231,10 +237,10 @@ _xfs_mru_cache_list_insert( */ STATIC void _xfs_mru_cache_clear_reap_list( - xfs_mru_cache_t *mru) __releases(mru->lock) __acquires(mru->lock) - + struct xfs_mru_cache *mru) + __releases(mru->lock) __acquires(mru->lock) { - xfs_mru_cache_elem_t *elem, *next; + struct xfs_mru_cache_elem *elem, *next; struct list_head tmp; INIT_LIST_HEAD(&tmp); @@ -252,15 +258,8 @@ _xfs_mru_cache_clear_reap_list( spin_unlock(&mru->lock); list_for_each_entry_safe(elem, next, &tmp, list_node) { - - /* Remove the element from the reap list. */ list_del_init(&elem->list_node); - - /* Call the client's free function with the key and value pointer. */ - mru->free_func(elem->key, elem->value); - - /* Free the element structure. */ - kmem_zone_free(xfs_mru_elem_zone, elem); + mru->free_func(elem); } spin_lock(&mru->lock); @@ -277,7 +276,8 @@ STATIC void _xfs_mru_cache_reap( struct work_struct *work) { - xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work); + struct xfs_mru_cache *mru = + container_of(work, struct xfs_mru_cache, work.work); unsigned long now, next; ASSERT(mru && mru->lists); @@ -304,28 +304,16 @@ _xfs_mru_cache_reap( int xfs_mru_cache_init(void) { - xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), - "xfs_mru_cache_elem"); - if (!xfs_mru_elem_zone) - goto out; - xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); if (!xfs_mru_reap_wq) - goto out_destroy_mru_elem_zone; - + return -ENOMEM; return 0; - - out_destroy_mru_elem_zone: - kmem_zone_destroy(xfs_mru_elem_zone); - out: - return -ENOMEM; } void xfs_mru_cache_uninit(void) { destroy_workqueue(xfs_mru_reap_wq); - kmem_zone_destroy(xfs_mru_elem_zone); } /* @@ -336,14 +324,14 @@ xfs_mru_cache_uninit(void) */ int xfs_mru_cache_create( - xfs_mru_cache_t **mrup, + struct xfs_mru_cache **mrup, unsigned int lifetime_ms, unsigned int grp_count, xfs_mru_cache_free_func_t free_func) { - xfs_mru_cache_t *mru = NULL; - int err = 0, grp; - unsigned int grp_time; + struct xfs_mru_cache *mru = NULL; + int err = 0, grp; + unsigned int grp_time; if (mrup) *mrup = NULL; @@ -400,7 +388,7 @@ exit: */ static void xfs_mru_cache_flush( - xfs_mru_cache_t *mru) + struct xfs_mru_cache *mru) { if (!mru || !mru->lists) return; @@ -420,7 +408,7 @@ xfs_mru_cache_flush( void xfs_mru_cache_destroy( - xfs_mru_cache_t *mru) + struct xfs_mru_cache *mru) { if (!mru || !mru->lists) return; @@ -438,38 +426,30 @@ xfs_mru_cache_destroy( */ int xfs_mru_cache_insert( - xfs_mru_cache_t *mru, - unsigned long key, - void *value) + struct xfs_mru_cache *mru, + unsigned long key, + struct xfs_mru_cache_elem *elem) { - xfs_mru_cache_elem_t *elem; + int error; ASSERT(mru && mru->lists); if (!mru || !mru->lists) return EINVAL; - elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP); - if (!elem) + if (radix_tree_preload(GFP_KERNEL)) return ENOMEM; - if (radix_tree_preload(GFP_KERNEL)) { - kmem_zone_free(xfs_mru_elem_zone, elem); - return ENOMEM; - } - INIT_LIST_HEAD(&elem->list_node); elem->key = key; - elem->value = value; spin_lock(&mru->lock); - - radix_tree_insert(&mru->store, key, elem); + error = -radix_tree_insert(&mru->store, key, elem); radix_tree_preload_end(); - _xfs_mru_cache_list_insert(mru, elem); - + if (!error) + _xfs_mru_cache_list_insert(mru, elem); spin_unlock(&mru->lock); - return 0; + return error; } /* @@ -478,13 +458,12 @@ xfs_mru_cache_insert( * the client data pointer for the removed element is returned, otherwise this * function will return a NULL pointer. */ -void * +struct xfs_mru_cache_elem * xfs_mru_cache_remove( - xfs_mru_cache_t *mru, - unsigned long key) + struct xfs_mru_cache *mru, + unsigned long key) { - xfs_mru_cache_elem_t *elem; - void *value = NULL; + struct xfs_mru_cache_elem *elem; ASSERT(mru && mru->lists); if (!mru || !mru->lists) @@ -492,17 +471,11 @@ xfs_mru_cache_remove( spin_lock(&mru->lock); elem = radix_tree_delete(&mru->store, key); - if (elem) { - value = elem->value; + if (elem) list_del(&elem->list_node); - } - spin_unlock(&mru->lock); - if (elem) - kmem_zone_free(xfs_mru_elem_zone, elem); - - return value; + return elem; } /* @@ -511,13 +484,14 @@ xfs_mru_cache_remove( */ void xfs_mru_cache_delete( - xfs_mru_cache_t *mru, - unsigned long key) + struct xfs_mru_cache *mru, + unsigned long key) { - void *value = xfs_mru_cache_remove(mru, key); + struct xfs_mru_cache_elem *elem; - if (value) - mru->free_func(key, value); + elem = xfs_mru_cache_remove(mru, key); + if (elem) + mru->free_func(elem); } /* @@ -540,12 +514,12 @@ xfs_mru_cache_delete( * status, we need to help it get it right by annotating the path that does * not release the lock. */ -void * +struct xfs_mru_cache_elem * xfs_mru_cache_lookup( - xfs_mru_cache_t *mru, - unsigned long key) + struct xfs_mru_cache *mru, + unsigned long key) { - xfs_mru_cache_elem_t *elem; + struct xfs_mru_cache_elem *elem; ASSERT(mru && mru->lists); if (!mru || !mru->lists) @@ -560,7 +534,7 @@ xfs_mru_cache_lookup( } else spin_unlock(&mru->lock); - return elem ? elem->value : NULL; + return elem; } /* @@ -570,7 +544,8 @@ xfs_mru_cache_lookup( */ void xfs_mru_cache_done( - xfs_mru_cache_t *mru) __releases(mru->lock) + struct xfs_mru_cache *mru) + __releases(mru->lock) { spin_unlock(&mru->lock); } diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index 36dd3ec8b4e..fb5245ba5ff 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -18,24 +18,15 @@ #ifndef __XFS_MRU_CACHE_H__ #define __XFS_MRU_CACHE_H__ +struct xfs_mru_cache; -/* Function pointer type for callback to free a client's data pointer. */ -typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*); +struct xfs_mru_cache_elem { + struct list_head list_node; + unsigned long key; +}; -typedef struct xfs_mru_cache -{ - struct radix_tree_root store; /* Core storage data structure. */ - struct list_head *lists; /* Array of lists, one per grp. */ - struct list_head reap_list; /* Elements overdue for reaping. */ - spinlock_t lock; /* Lock to protect this struct. */ - unsigned int grp_count; /* Number of discrete groups. */ - unsigned int grp_time; /* Time period spanned by grps. */ - unsigned int lru_grp; /* Group containing time zero. */ - unsigned long time_zero; /* Time first element was added. */ - xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ - struct delayed_work work; /* Workqueue data for reaping. */ - unsigned int queued; /* work has been queued */ -} xfs_mru_cache_t; +/* Function pointer type for callback to free a client's data pointer. */ +typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem); int xfs_mru_cache_init(void); void xfs_mru_cache_uninit(void); @@ -44,10 +35,12 @@ int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, xfs_mru_cache_free_func_t free_func); void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, - void *value); -void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); + struct xfs_mru_cache_elem *elem); +struct xfs_mru_cache_elem * +xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); -void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); +struct xfs_mru_cache_elem * +xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_done(struct xfs_mru_cache *mru); #endif /* __XFS_MRU_CACHE_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 3e6c2e6c9cd..6d26759c779 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -17,31 +17,28 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_itable.h" -#include "xfs_rtalloc.h" +#include "xfs_quota.h" #include "xfs_error.h" #include "xfs_bmap.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_cksum.h" +#include "xfs_dinode.h" /* * The global quota manager. There is only one of these for the entire @@ -137,8 +134,6 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_dquot *gdqp = NULL; - struct xfs_dquot *pdqp = NULL; xfs_dqlock(dqp); if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { @@ -146,21 +141,6 @@ xfs_qm_dqpurge( return EAGAIN; } - /* - * If this quota has a hint attached, prepare for releasing it now. - */ - gdqp = dqp->q_gdquot; - if (gdqp) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - - pdqp = dqp->q_pdquot; - if (pdqp) { - xfs_dqlock(pdqp); - dqp->q_pdquot = NULL; - } - dqp->dq_flags |= XFS_DQ_FREEING; xfs_dqflock(dqp); @@ -209,11 +189,6 @@ xfs_qm_dqpurge( XFS_STATS_DEC(xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); - - if (gdqp) - xfs_qm_dqput(gdqp); - if (pdqp) - xfs_qm_dqput(pdqp); return 0; } @@ -383,7 +358,6 @@ xfs_qm_dqattach_one( xfs_dqid_t id, uint type, uint doalloc, - xfs_dquot_t *udqhint, /* hint */ xfs_dquot_t **IO_idqpp) { xfs_dquot_t *dqp; @@ -393,9 +367,9 @@ xfs_qm_dqattach_one( error = 0; /* - * See if we already have it in the inode itself. IO_idqpp is - * &i_udquot or &i_gdquot. This made the code look weird, but - * made the logic a lot simpler. + * See if we already have it in the inode itself. IO_idqpp is &i_udquot + * or &i_gdquot. This made the code look weird, but made the logic a lot + * simpler. */ dqp = *IO_idqpp; if (dqp) { @@ -404,49 +378,10 @@ xfs_qm_dqattach_one( } /* - * udqhint is the i_udquot field in inode, and is non-NULL only - * when the type arg is group/project. Its purpose is to save a - * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside - * the user dquot. - */ - if (udqhint) { - ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); - xfs_dqlock(udqhint); - - /* - * No need to take dqlock to look at the id. - * - * The ID can't change until it gets reclaimed, and it won't - * be reclaimed as long as we have a ref from inode and we - * hold the ilock. - */ - if (type == XFS_DQ_GROUP) - dqp = udqhint->q_gdquot; - else - dqp = udqhint->q_pdquot; - if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { - ASSERT(*IO_idqpp == NULL); - - *IO_idqpp = xfs_qm_dqhold(dqp); - xfs_dqunlock(udqhint); - return 0; - } - - /* - * We can't hold a dquot lock when we call the dqget code. - * We'll deadlock in no time, because of (not conforming to) - * lock ordering - the inodelock comes before any dquot lock, - * and we may drop and reacquire the ilock in xfs_qm_dqget(). - */ - xfs_dqunlock(udqhint); - } - - /* - * Find the dquot from somewhere. This bumps the - * reference count of dquot and returns it locked. - * This can return ENOENT if dquot didn't exist on - * disk and we didn't ask it to allocate; - * ESRCH if quotas got turned off suddenly. + * Find the dquot from somewhere. This bumps the reference count of + * dquot and returns it locked. This can return ENOENT if dquot didn't + * exist on disk and we didn't ask it to allocate; ESRCH if quotas got + * turned off suddenly. */ error = xfs_qm_dqget(ip->i_mount, ip, id, type, doalloc | XFS_QMOPT_DOWARN, &dqp); @@ -464,48 +399,6 @@ xfs_qm_dqattach_one( return 0; } - -/* - * Given a udquot and group/project type, attach the group/project - * dquot pointer to the udquot as a hint for future lookups. - */ -STATIC void -xfs_qm_dqattach_hint( - struct xfs_inode *ip, - int type) -{ - struct xfs_dquot **dqhintp; - struct xfs_dquot *dqp; - struct xfs_dquot *udq = ip->i_udquot; - - ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); - - xfs_dqlock(udq); - - if (type == XFS_DQ_GROUP) { - dqp = ip->i_gdquot; - dqhintp = &udq->q_gdquot; - } else { - dqp = ip->i_pdquot; - dqhintp = &udq->q_pdquot; - } - - if (*dqhintp) { - struct xfs_dquot *tmp; - - if (*dqhintp == dqp) - goto done; - - tmp = *dqhintp; - *dqhintp = NULL; - xfs_qm_dqrele(tmp); - } - - *dqhintp = xfs_qm_dqhold(dqp); -done: - xfs_dqunlock(udq); -} - static bool xfs_qm_need_dqattach( struct xfs_inode *ip) @@ -536,7 +429,6 @@ xfs_qm_dqattach_locked( uint flags) { xfs_mount_t *mp = ip->i_mount; - uint nquotas = 0; int error = 0; if (!xfs_qm_need_dqattach(ip)) @@ -544,77 +436,39 @@ xfs_qm_dqattach_locked( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_IS_UQUOTA_ON(mp)) { + if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, flags & XFS_QMOPT_DQALLOC, - NULL, &ip->i_udquot); + &ip->i_udquot); if (error) goto done; - nquotas++; + ASSERT(ip->i_udquot); } - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_IS_GQUOTA_ON(mp)) { + if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, flags & XFS_QMOPT_DQALLOC, - ip->i_udquot, &ip->i_gdquot); - /* - * Don't worry about the udquot that we may have - * attached above. It'll get detached, if not already. - */ + &ip->i_gdquot); if (error) goto done; - nquotas++; + ASSERT(ip->i_gdquot); } - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_IS_PQUOTA_ON(mp)) { + if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, flags & XFS_QMOPT_DQALLOC, - ip->i_udquot, &ip->i_pdquot); - /* - * Don't worry about the udquot that we may have - * attached above. It'll get detached, if not already. - */ + &ip->i_pdquot); if (error) goto done; - nquotas++; + ASSERT(ip->i_pdquot); } +done: /* - * Attach this group/project quota to the user quota as a hint. - * This WON'T, in general, result in a thrash. + * Don't worry about the dquots that we may have attached before any + * error - they'll get detached later if it has not already been done. */ - if (nquotas > 1 && ip->i_udquot) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_gdquot || !XFS_IS_GQUOTA_ON(mp)); - ASSERT(ip->i_pdquot || !XFS_IS_PQUOTA_ON(mp)); - - /* - * We do not have i_udquot locked at this point, but this check - * is OK since we don't depend on the i_gdquot to be accurate - * 100% all the time. It is just a hint, and this will - * succeed in general. - */ - if (ip->i_udquot->q_gdquot != ip->i_gdquot) - xfs_qm_dqattach_hint(ip, XFS_DQ_GROUP); - - if (ip->i_udquot->q_pdquot != ip->i_pdquot) - xfs_qm_dqattach_hint(ip, XFS_DQ_PROJ); - } - - done: -#ifdef DEBUG - if (!error) { - if (XFS_IS_UQUOTA_ON(mp)) - ASSERT(ip->i_udquot); - if (XFS_IS_GQUOTA_ON(mp)) - ASSERT(ip->i_gdquot); - if (XFS_IS_PQUOTA_ON(mp)) - ASSERT(ip->i_pdquot); - } ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); -#endif return error; } @@ -664,20 +518,6 @@ xfs_qm_dqdetach( } } -int -xfs_qm_calc_dquots_per_chunk( - struct xfs_mount *mp, - unsigned int nbblks) /* basic block units */ -{ - unsigned int ndquots; - - ASSERT(nbblks > 0); - ndquots = BBTOB(nbblks); - do_div(ndquots, sizeof(xfs_dqblk_t)); - - return ndquots; -} - struct xfs_qm_isolate { struct list_head buffers; struct list_head dispose; @@ -831,22 +671,17 @@ xfs_qm_init_quotainfo( qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); - if ((error = list_lru_init(&qinf->qi_lru))) { - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = -list_lru_init(&qinf->qi_lru); + if (error) + goto out_free_qinf; /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ - if ((error = xfs_qm_init_quotainos(mp))) { - list_lru_destroy(&qinf->qi_lru); - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = xfs_qm_init_quotainos(mp); + if (error) + goto out_free_lru; INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); @@ -858,8 +693,7 @@ xfs_qm_init_quotainfo( /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); - qinf->qi_dqperchunk = xfs_qm_calc_dquots_per_chunk(mp, - qinf->qi_dqchunklen); + qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); @@ -906,7 +740,7 @@ xfs_qm_init_quotainfo( qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - + xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -923,6 +757,13 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&qinf->qi_shrinker); return 0; + +out_free_lru: + list_lru_destroy(&qinf->qi_lru); +out_free_qinf: + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; } @@ -1092,10 +933,10 @@ xfs_qm_reset_dqcounts( /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to - * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. + * find uninitialised dquot blks. See comment in xfs_dqcheck. */ - (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, - "xfs_quotacheck"); + xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, + "xfs_quotacheck"); ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; @@ -1210,16 +1051,18 @@ xfs_qm_dqiterate( lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); do { + uint lock_mode; + nmaps = XFS_DQITER_MAP_SIZE; /* * We aren't changing the inode itself. Just changing * some of its data. No new blocks are added here, and * the inode is never added to the transaction. */ - xfs_ilock(qip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(qip); error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, map, &nmaps, 0); - xfs_iunlock(qip, XFS_ILOCK_SHARED); + xfs_iunlock(qip, lock_mode); if (error) break; @@ -2099,24 +1942,21 @@ xfs_qm_vop_create_dqattach( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - if (udqp) { + if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(XFS_IS_UQUOTA_ON(mp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } - if (gdqp) { + if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(XFS_IS_GQUOTA_ON(mp)); ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } - if (pdqp) { + if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(XFS_IS_PQUOTA_ON(mp)); ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); ip->i_pdquot = xfs_qm_dqhold(pdqp); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 2b602df9c24..797fd463627 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -20,13 +20,29 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" -#include "xfs_quota_priv.h" struct xfs_inode; extern struct kmem_zone *xfs_qm_dqtrxzone; /* + * Number of bmaps that we ask from bmapi when doing a quotacheck. + * We make this restriction to keep the memory usage to a minimum. + */ +#define XFS_DQITER_MAP_SIZE 10 + +#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ + !dqp->q_core.d_blk_hardlimit && \ + !dqp->q_core.d_blk_softlimit && \ + !dqp->q_core.d_rtb_hardlimit && \ + !dqp->q_core.d_rtb_softlimit && \ + !dqp->q_core.d_ino_hardlimit && \ + !dqp->q_core.d_ino_softlimit && \ + !dqp->q_core.d_bcount && \ + !dqp->q_core.d_rtbcount && \ + !dqp->q_core.d_icount) + +/* * This defines the unit of allocation of dquots. * Currently, it is just one file system block, and a 4K blk contains 30 * (136 * 30 = 4080) dquots. It's probably not worth trying to make @@ -103,8 +119,6 @@ xfs_dq_to_quota_inode(struct xfs_dquot *dqp) return NULL; } -extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp, - unsigned int nbblks); extern void xfs_trans_mod_dquot(struct xfs_trans *, struct xfs_dquot *, uint, long); extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 3af50ccdfac..e9be63abd8d 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -18,21 +18,15 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" #include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_itable.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" +#include "xfs_trans.h" #include "xfs_qm.h" diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 8174aad0b38..bbc813caba4 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -20,24 +20,18 @@ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_itable.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" +#include "xfs_trans.h" #include "xfs_error.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" +#include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -284,22 +278,29 @@ xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) { - int error = 0, error2 = 0; + int error = EINVAL; - if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { - xfs_debug(mp, "%s: flags=%x m_qflags=%x\n", + if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || + (flags & ~XFS_DQ_ALLTYPES)) { + xfs_debug(mp, "%s: flags=%x m_qflags=%x", __func__, flags, mp->m_qflags); return XFS_ERROR(EINVAL); } - if (flags & XFS_DQ_USER) + if (flags & XFS_DQ_USER) { error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); - if (flags & XFS_DQ_GROUP) - error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } + if (flags & XFS_DQ_GROUP) { + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } if (flags & XFS_DQ_PROJ) - error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); - return error ? error : error2; + return error; } /* @@ -325,7 +326,7 @@ xfs_qm_scall_quotaon( sbflags = 0; if (flags == 0) { - xfs_debug(mp, "%s: zero flags, m_qflags=%x\n", + xfs_debug(mp, "%s: zero flags, m_qflags=%x", __func__, mp->m_qflags); return XFS_ERROR(EINVAL); } @@ -348,7 +349,7 @@ xfs_qm_scall_quotaon( (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && (flags & XFS_PQUOTA_ENFD))) { xfs_debug(mp, - "%s: Can't enforce without acct, flags=%x sbflags=%x\n", + "%s: Can't enforce without acct, flags=%x sbflags=%x", __func__, flags, mp->m_sb.sb_qflags); return XFS_ERROR(EINVAL); } @@ -648,7 +649,7 @@ xfs_qm_scall_setqlim( q->qi_bsoftlimit = soft; } } else { - xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft); + xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); } hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : @@ -664,7 +665,7 @@ xfs_qm_scall_setqlim( q->qi_rtbsoftlimit = soft; } } else { - xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft); + xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); } hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? @@ -681,7 +682,7 @@ xfs_qm_scall_setqlim( q->qi_isoftlimit = soft; } } else { - xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft); + xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); } /* @@ -959,7 +960,6 @@ xfs_qm_export_flags( STATIC int xfs_dqrele_inode( struct xfs_inode *ip, - struct xfs_perag *pag, int flags, void *args) { diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index e7d84d2d868..5376dd406ba 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -150,10 +150,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ f | XFS_QMOPT_RES_REGBLKS) -extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, - xfs_dqid_t, uint, uint, char *); extern int xfs_mount_reset_sbqflags(struct xfs_mount *); -extern const struct xfs_buf_ops xfs_dquot_buf_ops; - #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h index e6b0d6e1f4f..137e2093707 100644 --- a/fs/xfs/xfs_quota_defs.h +++ b/fs/xfs/xfs_quota_defs.h @@ -154,4 +154,8 @@ typedef __uint16_t xfs_qwarncnt_t; (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) +extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, + xfs_dqid_t id, uint type, uint flags, char *str); +extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); + #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h deleted file mode 100644 index 6d86219d93d..00000000000 --- a/fs/xfs/xfs_quota_priv.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_QUOTA_PRIV_H__ -#define __XFS_QUOTA_PRIV_H__ - -/* - * Number of bmaps that we ask from bmapi when doing a quotacheck. - * We make this restriction to keep the memory usage to a minimum. - */ -#define XFS_DQITER_MAP_SIZE 10 - -#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ - !dqp->q_core.d_blk_hardlimit && \ - !dqp->q_core.d_blk_softlimit && \ - !dqp->q_core.d_rtb_hardlimit && \ - !dqp->q_core.d_rtb_softlimit && \ - !dqp->q_core.d_ino_hardlimit && \ - !dqp->q_core.d_ino_softlimit && \ - !dqp->q_core.d_bcount && \ - !dqp->q_core.d_rtbcount && \ - !dqp->q_core.d_icount) - -#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ - (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ - (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) - -#endif /* __XFS_QUOTA_PRIV_H__ */ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 1326d81596c..2ad1b9822e9 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -17,15 +17,14 @@ */ #include "xfs.h" #include "xfs_format.h" +#include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_log.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_trans.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" #include "xfs_qm.h" #include <linux/quota.h> @@ -101,16 +100,36 @@ xfs_fs_set_xstate( if (!XFS_IS_QUOTA_ON(mp)) return -EINVAL; return -xfs_qm_scall_quotaoff(mp, flags); - case Q_XQUOTARM: - if (XFS_IS_QUOTA_ON(mp)) - return -EINVAL; - return -xfs_qm_scall_trunc_qfiles(mp, flags); } return -EINVAL; } STATIC int +xfs_fs_rm_xquota( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + if (XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + + if (uflags & FS_USER_QUOTA) + flags |= XFS_DQ_USER; + if (uflags & FS_GROUP_QUOTA) + flags |= XFS_DQ_GROUP; + if (uflags & FS_USER_QUOTA) + flags |= XFS_DQ_PROJ; + + return -xfs_qm_scall_trunc_qfiles(mp, flags); +} + +STATIC int xfs_fs_get_dqblk( struct super_block *sb, struct kqid qid, @@ -150,6 +169,7 @@ const struct quotactl_ops xfs_quotactl_operations = { .get_xstatev = xfs_fs_get_xstatev, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, + .rm_xquota = xfs_fs_rm_xquota, .get_dqblk = xfs_fs_get_dqblk, .set_dqblk = xfs_fs_set_dqblk, }; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 6f9e63c9fc2..ec5ca65c621 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -17,172 +17,260 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" -#include "xfs_rtalloc.h" -#include "xfs_fsops.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" #include "xfs_error.h" -#include "xfs_inode_item.h" +#include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_buf.h" #include "xfs_icache.h" +#include "xfs_dinode.h" +#include "xfs_rtalloc.h" /* - * Prototypes for internal functions. + * Read and return the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. */ +STATIC int /* error */ +xfs_rtget_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ +{ + xfs_buf_t *bp; /* buffer for summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information & copy it out. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sum = *sp; + /* + * Drop the buffer if we're not asked to remember it. + */ + if (!rbpp) + xfs_trans_brelse(tp, bp); + return 0; +} -STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *); -STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int, - xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *); -STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_extlen_t, int, xfs_rtblock_t *, int *); -STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_rtblock_t, xfs_rtblock_t *); -STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_rtblock_t, xfs_rtblock_t *); -STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int, - xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *); -STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_extlen_t, int); -STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int, - xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *); - -/* - * Internal functions. - */ /* - * Allocate space to the bitmap or summary file, and zero it, for growfs. + * Return whether there are any free extents in the size range given + * by low and high, for the bitmap block bbno. */ STATIC int /* error */ -xfs_growfs_rt_alloc( - xfs_mount_t *mp, /* file system mount point */ - xfs_extlen_t oblocks, /* old count of blocks */ - xfs_extlen_t nblocks, /* new count of blocks */ - xfs_inode_t *ip) /* inode (bitmap/summary) */ +xfs_rtany_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int low, /* low log2 extent size */ + int high, /* high log2 extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + int *stat) /* out: any good extents here? */ { - xfs_fileoff_t bno; /* block number in file */ - xfs_buf_t *bp; /* temporary buffer for zeroing */ - int committed; /* transaction committed flag */ - xfs_daddr_t d; /* disk block address */ - int error; /* error return value */ - xfs_fsblock_t firstblock; /* first block allocated in xaction */ - xfs_bmap_free_t flist; /* list of freed blocks */ - xfs_fsblock_t fsbno; /* filesystem block for bno */ - xfs_bmbt_irec_t map; /* block map output */ - int nmap; /* number of block maps */ - int resblks; /* space reservation */ + int error; /* error value */ + int log; /* loop counter, log2 of ext. size */ + xfs_suminfo_t sum; /* summary data */ /* - * Allocate space to the file, as necessary. + * Loop over logs of extent sizes. Order is irrelevant. */ - while (oblocks < nblocks) { - int cancelflags = 0; - xfs_trans_t *tp; - - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); - resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); + for (log = low; log <= high; log++) { /* - * Reserve space & log for one extent added to the file. + * Get one summary datum. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, - resblks, 0); - if (error) - goto error_cancel; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; + error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); + if (error) { + return error; + } /* - * Lock the inode. + * If there are any, return success. */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + if (sum) { + *stat = 1; + return 0; + } + } + /* + * Found nothing, return failure. + */ + *stat = 0; + return 0; +} - xfs_bmap_init(&flist, &firstblock); - /* - * Allocate blocks to the bitmap file. - */ - nmap = 1; - cancelflags |= XFS_TRANS_ABORT; - error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, - XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist); - if (!error && nmap < 1) - error = XFS_ERROR(ENOSPC); - if (error) - goto error_cancel; - /* - * Free any blocks freed up in the transaction, then commit. - */ - error = xfs_bmap_finish(&tp, &flist, &committed); - if (error) - goto error_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) - goto error; - /* - * Now we need to clear the allocated blocks. - * Do this one block per transaction, to keep it simple. - */ - cancelflags = 0; - for (bno = map.br_startoff, fsbno = map.br_startblock; - bno < map.br_startoff + map.br_blockcount; - bno++, fsbno++) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); - /* - * Reserve log for one block zeroing. - */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, - 0, 0); + +/* + * Copy and transform the summary file, given the old and new + * parameters in the mount structures. + */ +STATIC int /* error */ +xfs_rtcopy_summary( + xfs_mount_t *omp, /* old file system mount point */ + xfs_mount_t *nmp, /* new file system mount point */ + xfs_trans_t *tp) /* transaction pointer */ +{ + xfs_rtblock_t bbno; /* bitmap block number */ + xfs_buf_t *bp; /* summary buffer */ + int error; /* error return value */ + int log; /* summary level number (log length) */ + xfs_suminfo_t sum; /* summary data */ + xfs_fsblock_t sumbno; /* summary block number */ + + bp = NULL; + for (log = omp->m_rsumlevels - 1; log >= 0; log--) { + for (bbno = omp->m_sb.sb_rbmblocks - 1; + (xfs_srtblock_t)bbno >= 0; + bbno--) { + error = xfs_rtget_summary(omp, tp, log, bbno, &bp, + &sumbno, &sum); if (error) - goto error_cancel; - /* - * Lock the bitmap inode. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - /* - * Get a buffer for the block. - */ - d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize, 0); - if (bp == NULL) { - error = XFS_ERROR(EIO); -error_cancel: - xfs_trans_cancel(tp, cancelflags); - goto error; - } - memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); - xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); - /* - * Commit the transaction. - */ - error = xfs_trans_commit(tp, 0); + return error; + if (sum == 0) + continue; + error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, + &bp, &sumbno); if (error) - goto error; + return error; + error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, + &bp, &sumbno); + if (error) + return error; + ASSERT(sum > 0); } - /* - * Go on to the next extent, if any. - */ - oblocks = map.br_startoff + map.br_blockcount; } return 0; +} +/* + * Mark an extent specified by start and len allocated. + * Updates all the summary information as well as the bitmap. + */ +STATIC int /* error */ +xfs_rtallocate_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* start block to allocate */ + xfs_extlen_t len, /* length to allocate */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the allocated extent */ + int error; /* error value */ + xfs_rtblock_t postblock = 0; /* first block allocated > end */ + xfs_rtblock_t preblock = 0; /* first block allocated < start */ -error: + end = start + len - 1; + /* + * Assume we're allocating out of the middle of a free extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of free extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) { + return error; + } + /* + * Decrement the summary information corresponding to the entire + * (old) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + /* + * If there are blocks not being allocated at the front of the + * old extent, add summary data for them to be free. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being allocated at the end of the + * old extent, add summary data for them to be free. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Modify the bitmap to mark this extent allocated. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 0); return error; } @@ -721,1112 +809,126 @@ xfs_rtallocate_extent_size( } /* - * Mark an extent specified by start and len allocated. - * Updates all the summary information as well as the bitmap. + * Allocate space to the bitmap or summary file, and zero it, for growfs. */ STATIC int /* error */ -xfs_rtallocate_range( +xfs_growfs_rt_alloc( xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* start block to allocate */ - xfs_extlen_t len, /* length to allocate */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + xfs_inode_t *ip) /* inode (bitmap/summary) */ { - xfs_rtblock_t end; /* end of the allocated extent */ - int error; /* error value */ - xfs_rtblock_t postblock = 0; /* first block allocated > end */ - xfs_rtblock_t preblock = 0; /* first block allocated < start */ + xfs_fileoff_t bno; /* block number in file */ + xfs_buf_t *bp; /* temporary buffer for zeroing */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock; /* first block allocated in xaction */ + xfs_bmap_free_t flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + xfs_bmbt_irec_t map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ - end = start + len - 1; - /* - * Assume we're allocating out of the middle of a free extent. - * We need to find the beginning and end of the extent so we can - * properly update the summary. - */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); - if (error) { - return error; - } - /* - * Find the next allocated block (end of free extent). - */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); - if (error) { - return error; - } - /* - * Decrement the summary information corresponding to the entire - * (old) free extent. - */ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); - if (error) { - return error; - } - /* - * If there are blocks not being allocated at the front of the - * old extent, add summary data for them to be free. - */ - if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); - if (error) { - return error; - } - } - /* - * If there are blocks not being allocated at the end of the - * old extent, add summary data for them to be free. - */ - if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); - if (error) { - return error; - } - } /* - * Modify the bitmap to mark this extent allocated. + * Allocate space to the file, as necessary. */ - error = xfs_rtmodify_range(mp, tp, start, len, 0); - return error; -} - -/* - * Return whether there are any free extents in the size range given - * by low and high, for the bitmap block bbno. - */ -STATIC int /* error */ -xfs_rtany_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int low, /* low log2 extent size */ - int high, /* high log2 extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - int *stat) /* out: any good extents here? */ -{ - int error; /* error value */ - int log; /* loop counter, log2 of ext. size */ - xfs_suminfo_t sum; /* summary data */ + while (oblocks < nblocks) { + int cancelflags = 0; + xfs_trans_t *tp; - /* - * Loop over logs of extent sizes. Order is irrelevant. - */ - for (log = low; log <= high; log++) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); + resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); /* - * Get one summary datum. + * Reserve space & log for one extent added to the file. */ - error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); - if (error) { - return error; - } + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, + resblks, 0); + if (error) + goto error_cancel; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; /* - * If there are any, return success. + * Lock the inode. */ - if (sum) { - *stat = 1; - return 0; - } - } - /* - * Found nothing, return failure. - */ - *stat = 0; - return 0; -} - -/* - * Get a buffer for the bitmap or summary file block specified. - * The buffer is returned read and locked. - */ -STATIC int /* error */ -xfs_rtbuf_get( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t block, /* block number in bitmap or summary */ - int issum, /* is summary not bitmap */ - xfs_buf_t **bpp) /* output: buffer for the block */ -{ - xfs_buf_t *bp; /* block buffer, result */ - xfs_inode_t *ip; /* bitmap or summary inode */ - xfs_bmbt_irec_t map; - int nmap = 1; - int error; /* error value */ - - ip = issum ? mp->m_rsumip : mp->m_rbmip; - - error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); - if (error) - return error; - - ASSERT(map.br_startblock != NULLFSBLOCK); - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp, NULL); - if (error) - return error; - ASSERT(!xfs_buf_geterror(bp)); - *bpp = bp; - return 0; -} - -#ifdef DEBUG -/* - * Check that the given extent (block range) is allocated already. - */ -STATIC int /* error */ -xfs_rtcheck_alloc_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* out: 1 for allocated, 0 for not */ -{ - xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ - - return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat); -} -#endif - -/* - * Check that the given range is either all allocated (val = 0) or - * all free (val = 1). - */ -STATIC int /* error */ -xfs_rtcheck_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int val, /* 1 for free, 0 for allocated */ - xfs_rtblock_t *new, /* out: first block not matching */ - int *stat) /* out: 1 for matches, 0 for not */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - /* - * Compute starting bitmap block number - */ - block = XFS_BITTOBLOCK(mp, start); - /* - * Read the bitmap block. - */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - /* - * Compute the starting word's address, and starting bit. - */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - /* - * 0 (allocated) => all zero's; 1 (free) => all one's. - */ - val = -val; - /* - * If not starting on a word boundary, deal with the first - * (partial) word. - */ - if (bit) { - /* - * Compute first bit not examined. - */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); - /* - * Mask of relevant bits. - */ - mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = (*b ^ val) & mask)) { - /* - * Different, compute first wrong bit and return. - */ - xfs_trans_brelse(tp, bp); - i = XFS_RTLOBIT(wdiff) - bit; - *new = start + i; - *stat = 0; - return 0; - } - i = lastbit - bit; - /* - * Go on to next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * If done with this block, get the next one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the next one. - */ - while (len - i >= XFS_NBWORD) { - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = *b ^ val)) { - /* - * Different, compute first wrong bit and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *new = start + i; - *stat = 0; - return 0; - } - i += XFS_NBWORD; + xfs_bmap_init(&flist, &firstblock); /* - * Go on to next block if that's where the next word is - * and we need the next word. + * Allocate blocks to the bitmap file. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * If done with this block, get the next one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; - } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if ((lastbit = len - i)) { + nmap = 1; + cancelflags |= XFS_TRANS_ABORT; + error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); + if (!error && nmap < 1) + error = XFS_ERROR(ENOSPC); + if (error) + goto error_cancel; /* - * Mask of relevant bits. + * Free any blocks freed up in the transaction, then commit. */ - mask = ((xfs_rtword_t)1 << lastbit) - 1; + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto error_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error; /* - * Compute difference between actual and desired value. + * Now we need to clear the allocated blocks. + * Do this one block per transaction, to keep it simple. */ - if ((wdiff = (*b ^ val) & mask)) { + cancelflags = 0; + for (bno = map.br_startoff, fsbno = map.br_startblock; + bno < map.br_startoff + map.br_blockcount; + bno++, fsbno++) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); /* - * Different, compute first wrong bit and return. + * Reserve log for one block zeroing. */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *new = start + i; - *stat = 0; - return 0; - } else - i = len; - } - /* - * Successful, return. - */ - xfs_trans_brelse(tp, bp); - *new = start + i; - *stat = 1; - return 0; -} - -/* - * Copy and transform the summary file, given the old and new - * parameters in the mount structures. - */ -STATIC int /* error */ -xfs_rtcopy_summary( - xfs_mount_t *omp, /* old file system mount point */ - xfs_mount_t *nmp, /* new file system mount point */ - xfs_trans_t *tp) /* transaction pointer */ -{ - xfs_rtblock_t bbno; /* bitmap block number */ - xfs_buf_t *bp; /* summary buffer */ - int error; /* error return value */ - int log; /* summary level number (log length) */ - xfs_suminfo_t sum; /* summary data */ - xfs_fsblock_t sumbno; /* summary block number */ - - bp = NULL; - for (log = omp->m_rsumlevels - 1; log >= 0; log--) { - for (bbno = omp->m_sb.sb_rbmblocks - 1; - (xfs_srtblock_t)bbno >= 0; - bbno--) { - error = xfs_rtget_summary(omp, tp, log, bbno, &bp, - &sumbno, &sum); - if (error) - return error; - if (sum == 0) - continue; - error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, - &bp, &sumbno); - if (error) - return error; - error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, - &bp, &sumbno); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, + 0, 0); if (error) - return error; - ASSERT(sum > 0); - } - } - return 0; -} - -/* - * Searching backward from start to limit, find the first block whose - * allocated/free state is different from start's. - */ -STATIC int /* error */ -xfs_rtfind_back( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t firstbit; /* first useful bit in the word */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ - - /* - * Compute and read in starting bitmap block for starting block. - */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - /* - * Get the first word's index & point to it. - */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - len = start - limit + 1; - /* - * Compute match value, based on the bit at start: if 1 (free) - * then all-ones, else all-zeroes. - */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; - /* - * If the starting position is not word-aligned, deal with the - * partial word. - */ - if (bit < XFS_NBWORD - 1) { - /* - * Calculate first (leftmost) bit number to look at, - * and mask for all the relevant bits in this word. - */ - firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); - mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << - firstbit; - /* - * Calculate the difference between the value there - * and what we're looking for. - */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different. Mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i = bit - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; - return 0; - } - i = bit - firstbit + 1; - /* - * Go on to previous block if that's where the previous word is - * and we need the previous word. - */ - if (--word == -1 && i < len) { - /* - * If done with this block, get the previous one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. - */ - b--; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the previous one. - */ - while (len - i >= XFS_NBWORD) { - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = *b ^ want)) { - /* - * Different, mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; - return 0; - } - i += XFS_NBWORD; - /* - * Go on to previous block if that's where the previous word is - * and we need the previous word. - */ - if (--word == -1 && i < len) { - /* - * If done with this block, get the previous one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. - */ - b--; - } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if (len - i) { - /* - * Calculate first (leftmost) bit number to look at, - * and mask for all the relevant bits in this word. - */ - firstbit = XFS_NBWORD - (len - i); - mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit; - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different, mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; - return 0; - } else - i = len; - } - /* - * No match, return that we scanned the whole area. - */ - xfs_trans_brelse(tp, bp); - *rtblock = start - i + 1; - return 0; -} - -/* - * Searching forward from start to limit, find the first block whose - * allocated/free state is different from start's. - */ -STATIC int /* error */ -xfs_rtfind_forw( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t lastbit; /* last useful bit in the word */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ - - /* - * Compute and read in starting bitmap block for starting block. - */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - /* - * Get the first word's index & point to it. - */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - len = limit - start + 1; - /* - * Compute match value, based on the bit at start: if 1 (free) - * then all-ones, else all-zeroes. - */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; - /* - * If the starting position is not word-aligned, deal with the - * partial word. - */ - if (bit) { - /* - * Calculate last (rightmost) bit number to look at, - * and mask for all the relevant bits in this word. - */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); - mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; - /* - * Calculate the difference between the value there - * and what we're looking for. - */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different. Mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i = XFS_RTLOBIT(wdiff) - bit; - *rtblock = start + i - 1; - return 0; - } - i = lastbit - bit; - /* - * Go on to next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * If done with this block, get the previous one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the previous word in the buffer. - */ - b++; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the next one. - */ - while (len - i >= XFS_NBWORD) { - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = *b ^ want)) { + goto error_cancel; /* - * Different, mark where we are and return. + * Lock the bitmap inode. */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; - return 0; - } - i += XFS_NBWORD; - /* - * Go on to next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* - * If done with this block, get the next one. + * Get a buffer for the block. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize, 0); + if (bp == NULL) { + error = XFS_ERROR(EIO); +error_cancel: + xfs_trans_cancel(tp, cancelflags); + goto error; } - b = bufp = bp->b_addr; - word = 0; - } else { + memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); + xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* - * Go on to the next word in the buffer. + * Commit the transaction. */ - b++; + error = xfs_trans_commit(tp, 0); + if (error) + goto error; } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if ((lastbit = len - i)) { /* - * Calculate mask for all the relevant bits in this word. - */ - mask = ((xfs_rtword_t)1 << lastbit) - 1; - /* - * Compute difference between actual and desired value. + * Go on to the next extent, if any. */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different, mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; - return 0; - } else - i = len; + oblocks = map.br_startoff + map.br_blockcount; } - /* - * No match, return that we scanned the whole area. - */ - xfs_trans_brelse(tp, bp); - *rtblock = start + i - 1; return 0; -} -/* - * Mark an extent specified by start and len freed. - * Updates all the summary information as well as the bitmap. - */ -STATIC int /* error */ -xfs_rtfree_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to free */ - xfs_extlen_t len, /* length to free */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ -{ - xfs_rtblock_t end; /* end of the freed extent */ - int error; /* error value */ - xfs_rtblock_t postblock; /* first block freed > end */ - xfs_rtblock_t preblock; /* first block freed < start */ - - end = start + len - 1; - /* - * Modify the bitmap to mark this extent freed. - */ - error = xfs_rtmodify_range(mp, tp, start, len, 1); - if (error) { - return error; - } - /* - * Assume we're freeing out of the middle of an allocated extent. - * We need to find the beginning and end of the extent so we can - * properly update the summary. - */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); - if (error) { - return error; - } - /* - * Find the next allocated block (end of allocated extent). - */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); - if (error) - return error; - /* - * If there are blocks not being freed at the front of the - * old extent, add summary data for them to be allocated. - */ - if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); - if (error) { - return error; - } - } - /* - * If there are blocks not being freed at the end of the - * old extent, add summary data for them to be allocated. - */ - if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); - if (error) { - return error; - } - } - /* - * Increment the summary information corresponding to the entire - * (new) free extent. - */ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); +error: return error; } /* - * Read and return the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - */ -STATIC int /* error */ -xfs_rtget_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ -{ - xfs_buf_t *bp; /* buffer for summary block */ - int error; /* error value */ - xfs_fsblock_t sb; /* summary fsblock */ - int so; /* index into the summary file */ - xfs_suminfo_t *sp; /* pointer to returned data */ - - /* - * Compute entry number in the summary file. - */ - so = XFS_SUMOFFS(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = XFS_SUMOFFSTOBLOCK(mp, so); - /* - * If we have an old buffer, and the block number matches, use that. - */ - if (rbpp && *rbpp && *rsb == sb) - bp = *rbpp; - /* - * Otherwise we have to get the buffer. - */ - else { - /* - * If there was an old one, get rid of it first. - */ - if (rbpp && *rbpp) - xfs_trans_brelse(tp, *rbpp); - error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); - if (error) { - return error; - } - /* - * Remember this buffer and block for the next call. - */ - if (rbpp) { - *rbpp = bp; - *rsb = sb; - } - } - /* - * Point to the summary information & copy it out. - */ - sp = XFS_SUMPTR(mp, bp, so); - *sum = *sp; - /* - * Drop the buffer if we're not asked to remember it. - */ - if (!rbpp) - xfs_trans_brelse(tp, bp); - return 0; -} - -/* - * Set the given range of bitmap bits to the given value. - * Do whatever I/O and logging is required. - */ -STATIC int /* error */ -xfs_rtmodify_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to modify */ - xfs_extlen_t len, /* length of extent to modify */ - int val) /* 1 for free, 0 for allocated */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtword_t *first; /* first used word in the buffer */ - int i; /* current bit number rel. to start */ - int lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask o frelevant bits for value */ - int word; /* word number in the buffer */ - - /* - * Compute starting bitmap block number. - */ - block = XFS_BITTOBLOCK(mp, start); - /* - * Read the bitmap block, and point to its data. - */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - /* - * Compute the starting word's address, and starting bit. - */ - word = XFS_BITTOWORD(mp, start); - first = b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - /* - * 0 (allocated) => all zeroes; 1 (free) => all ones. - */ - val = -val; - /* - * If not starting on a word boundary, deal with the first - * (partial) word. - */ - if (bit) { - /* - * Compute first bit not changed and mask of relevant bits. - */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); - mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; - /* - * Set/clear the active bits. - */ - if (val) - *b |= mask; - else - *b &= ~mask; - i = lastbit - bit; - /* - * Go on to the next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * Log the changed part of this block. - * Get the next one. - */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - first = b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the next one. - */ - while (len - i >= XFS_NBWORD) { - /* - * Set the word value correctly. - */ - *b = val; - i += XFS_NBWORD; - /* - * Go on to the next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * Log the changed part of this block. - * Get the next one. - */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - first = b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; - } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if ((lastbit = len - i)) { - /* - * Compute a mask of relevant bits. - */ - bit = 0; - mask = ((xfs_rtword_t)1 << lastbit) - 1; - /* - * Set/clear the active bits. - */ - if (val) - *b |= mask; - else - *b &= ~mask; - b++; - } - /* - * Log any remaining changed bytes. - */ - if (b > first) - xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp - 1)); - return 0; -} - -/* - * Read and modify the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - */ -STATIC int /* error */ -xfs_rtmodify_summary( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ -{ - xfs_buf_t *bp; /* buffer for the summary block */ - int error; /* error value */ - xfs_fsblock_t sb; /* summary fsblock */ - int so; /* index into the summary file */ - xfs_suminfo_t *sp; /* pointer to returned data */ - - /* - * Compute entry number in the summary file. - */ - so = XFS_SUMOFFS(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = XFS_SUMOFFSTOBLOCK(mp, so); - /* - * If we have an old buffer, and the block number matches, use that. - */ - if (rbpp && *rbpp && *rsb == sb) - bp = *rbpp; - /* - * Otherwise we have to get the buffer. - */ - else { - /* - * If there was an old one, get rid of it first. - */ - if (rbpp && *rbpp) - xfs_trans_brelse(tp, *rbpp); - error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); - if (error) { - return error; - } - /* - * Remember this buffer and block for the next call. - */ - if (rbpp) { - *rbpp = bp; - *rsb = sb; - } - } - /* - * Point to the summary information, modify and log it. - */ - sp = XFS_SUMPTR(mp, bp, so); - *sp += delta; - xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr), - (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1)); - return 0; -} - -/* * Visible (exported) functions. */ @@ -2129,66 +1231,6 @@ xfs_rtallocate_extent( } /* - * Free an extent in the realtime subvolume. Length is expressed in - * realtime extents, as is the block number. - */ -int /* error */ -xfs_rtfree_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to free */ - xfs_extlen_t len) /* length of extent freed */ -{ - int error; /* error value */ - xfs_mount_t *mp; /* file system mount structure */ - xfs_fsblock_t sb; /* summary file block number */ - xfs_buf_t *sumbp; /* summary file block buffer */ - - mp = tp->t_mountp; - - ASSERT(mp->m_rbmip->i_itemp != NULL); - ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - -#ifdef DEBUG - /* - * Check to see that this whole range is currently allocated. - */ - { - int stat; /* result from checking range */ - - error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat); - if (error) { - return error; - } - ASSERT(stat); - } -#endif - sumbp = NULL; - /* - * Free the range of realtime blocks. - */ - error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); - if (error) { - return error; - } - /* - * Mark more blocks free in the superblock. - */ - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); - /* - * If we've now freed all the blocks, reset the file sequence - * number to 0. - */ - if (tp->t_frextents_delta + mp->m_sb.sb_frextents == - mp->m_sb.sb_rextents) { - if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) - mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; - *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; - xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); - } - return 0; -} - -/* * Initialize realtime fields in the mount structure. */ int /* error */ diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index b2a1a24c0e2..752b63d1030 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -95,6 +95,30 @@ xfs_growfs_rt( struct xfs_mount *mp, /* file system mount structure */ xfs_growfs_rt_t *in); /* user supplied growfs struct */ +/* + * From xfs_rtbitmap.c + */ +int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t block, int issum, struct xfs_buf **bpp); +int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, int val, + xfs_rtblock_t *new, int *stat); +int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_rtblock_t limit, + xfs_rtblock_t *rtblock); +int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_rtblock_t limit, + xfs_rtblock_t *rtblock); +int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, int val); +int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, + xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp, + xfs_fsblock_t *rsb); +int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, + struct xfs_buf **rbpp, xfs_fsblock_t *rsb); + + #else # define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/xfs_rtbitmap.c new file mode 100644 index 00000000000..f4dd697cac0 --- /dev/null +++ b/fs/xfs/xfs_rtbitmap.c @@ -0,0 +1,973 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_buf.h" +#include "xfs_icache.h" +#include "xfs_dinode.h" +#include "xfs_rtalloc.h" + + +/* + * Realtime allocator bitmap functions shared with userspace. + */ + +/* + * Get a buffer for the bitmap or summary file block specified. + * The buffer is returned read and locked. + */ +int +xfs_rtbuf_get( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t block, /* block number in bitmap or summary */ + int issum, /* is summary not bitmap */ + xfs_buf_t **bpp) /* output: buffer for the block */ +{ + xfs_buf_t *bp; /* block buffer, result */ + xfs_inode_t *ip; /* bitmap or summary inode */ + xfs_bmbt_irec_t map; + int nmap = 1; + int error; /* error value */ + + ip = issum ? mp->m_rsumip : mp->m_rbmip; + + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + if (error) + return error; + + ASSERT(map.br_startblock != NULLFSBLOCK); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), + mp->m_bsize, 0, &bp, NULL); + if (error) + return error; + *bpp = bp; + return 0; +} + +/* + * Searching backward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +int +xfs_rtfind_back( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t firstbit; /* first useful bit in the word */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = start - limit + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit < XFS_NBWORD - 1) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << + firstbit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = bit - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i = bit - firstbit + 1; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the previous one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if (len - i) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_NBWORD - (len - i); + mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start - i + 1; + return 0; +} + +/* + * Searching forward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +int +xfs_rtfind_forw( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in the word */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = limit - start + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit) { + /* + * Calculate last (rightmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *rtblock = start + i - 1; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the previous word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Calculate mask for all the relevant bits in this word. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start + i - 1; + return 0; +} + +/* + * Read and modify the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. + */ +int +xfs_rtmodify_summary( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_buf_t *bp; /* buffer for the summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information, modify and log it. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sp += delta; + xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr), + (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1)); + return 0; +} + +/* + * Set the given range of bitmap bits to the given value. + * Do whatever I/O and logging is required. + */ +int +xfs_rtmodify_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to modify */ + xfs_extlen_t len, /* length of extent to modify */ + int val) /* 1 for free, 0 for allocated */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtword_t *first; /* first used word in the buffer */ + int i; /* current bit number rel. to start */ + int lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask o frelevant bits for value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number. + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block, and point to its data. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + first = b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zeroes; 1 (free) => all ones. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not changed and mask of relevant bits. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + i = lastbit - bit; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Set the word value correctly. + */ + *b = val; + i += XFS_NBWORD; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Compute a mask of relevant bits. + */ + bit = 0; + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + b++; + } + /* + * Log any remaining changed bytes. + */ + if (b > first) + xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp - 1)); + return 0; +} + +/* + * Mark an extent specified by start and len freed. + * Updates all the summary information as well as the bitmap. + */ +int +xfs_rtfree_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to free */ + xfs_extlen_t len, /* length to free */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the freed extent */ + int error; /* error value */ + xfs_rtblock_t postblock; /* first block freed > end */ + xfs_rtblock_t preblock; /* first block freed < start */ + + end = start + len - 1; + /* + * Modify the bitmap to mark this extent freed. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 1); + if (error) { + return error; + } + /* + * Assume we're freeing out of the middle of an allocated extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of allocated extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) + return error; + /* + * If there are blocks not being freed at the front of the + * old extent, add summary data for them to be allocated. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being freed at the end of the + * old extent, add summary data for them to be allocated. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Increment the summary information corresponding to the entire + * (new) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + return error; +} + +/* + * Check that the given range is either all allocated (val = 0) or + * all free (val = 1). + */ +int +xfs_rtcheck_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int val, /* 1 for free, 0 for allocated */ + xfs_rtblock_t *new, /* out: first block not matching */ + int *stat) /* out: 1 for matches, 0 for not */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zero's; 1 (free) => all one's. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not examined. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + /* + * Mask of relevant bits. + */ + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *new = start + i; + *stat = 0; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ val)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Mask of relevant bits. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } else + i = len; + } + /* + * Successful, return. + */ + xfs_trans_brelse(tp, bp); + *new = start + i; + *stat = 1; + return 0; +} + +#ifdef DEBUG +/* + * Check that the given extent (block range) is allocated already. + */ +STATIC int /* error */ +xfs_rtcheck_alloc_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ + int stat; + int error; + + error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat); + if (error) + return error; + ASSERT(stat); + return 0; +} +#else +#define xfs_rtcheck_alloc_range(m,t,b,l) (0) +#endif +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len) /* length of extent freed */ +{ + int error; /* error value */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp = NULL; /* summary file block buffer */ + + mp = tp->t_mountp; + + ASSERT(mp->m_rbmip->i_itemp != NULL); + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + + error = xfs_rtcheck_alloc_range(mp, tp, bno, len); + if (error) + return error; + + /* + * Free the range of realtime blocks. + */ + error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); + if (error) { + return error; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* + * If we've now freed all the blocks, reset the file sequence + * number to 0. + */ + if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + mp->m_sb.sb_rextents) { + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + } + return 0; +} + diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c index a5b59d92eb7..7703fa6770f 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/xfs_sb.c @@ -17,34 +17,26 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_dir2_format.h" -#include "xfs_dir2.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" -#include "xfs_rtalloc.h" -#include "xfs_bmap.h" #include "xfs_error.h" -#include "xfs_quota.h" -#include "xfs_fsops.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" +#include "xfs_dinode.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -209,10 +201,6 @@ xfs_mount_validate_sb( * write validation, we don't need to check feature masks. */ if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { - xfs_alert(mp, -"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" -"Use of these features in this kernel is at your own risk!"); - if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, @@ -249,13 +237,13 @@ xfs_mount_validate_sb( if (xfs_sb_version_has_pquotino(sbp)) { if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { xfs_notice(mp, - "Version 5 of Super block has XFS_OQUOTA bits.\n"); + "Version 5 of Super block has XFS_OQUOTA bits."); return XFS_ERROR(EFSCORRUPTED); } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, -"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.\n"); +"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits."); return XFS_ERROR(EFSCORRUPTED); } @@ -296,15 +284,16 @@ xfs_mount_validate_sb( sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || sbp->sb_inodesize != (1 << sbp->sb_inodelog) || + sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || sbp->sb_dblocks == 0 || sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || - sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { - XFS_CORRUPTION_ERROR("SB sanity check failed", - XFS_ERRLEVEL_LOW, mp, sbp); + sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) || + sbp->sb_shared_vn != 0)) { + xfs_notice(mp, "SB sanity check failed"); return XFS_ERROR(EFSCORRUPTED); } @@ -345,15 +334,6 @@ xfs_mount_validate_sb( xfs_warn(mp, "Offline file system operation in progress!"); return XFS_ERROR(EFSCORRUPTED); } - - /* - * Version 1 directory format has never worked on Linux. - */ - if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { - xfs_warn(mp, "file system using version 1 directory format"); - return XFS_ERROR(ENOSYS); - } - return 0; } @@ -503,10 +483,16 @@ xfs_sb_quota_to_disk( } /* - * GQUOTINO and PQUOTINO cannot be used together in versions - * of superblock that do not have pquotino. from->sb_flags - * tells us which quota is active and should be copied to - * disk. + * GQUOTINO and PQUOTINO cannot be used together in versions of + * superblock that do not have pquotino. from->sb_flags tells us which + * quota is active and should be copied to disk. If neither are active, + * make sure we write NULLFSINO to the sb_gquotino field as a quota + * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature + * bit is set. + * + * Note that we don't need to handle the sb_uquotino or sb_pquotino here + * as they do not require any translation. Hence the main sb field loop + * will write them appropriately from the in-core superblock. */ if ((*fields & XFS_SB_GQUOTINO) && (from->sb_qflags & XFS_GQUOTA_ACCT)) @@ -514,6 +500,17 @@ xfs_sb_quota_to_disk( else if ((*fields & XFS_SB_PQUOTINO) && (from->sb_qflags & XFS_PQUOTA_ACCT)) to->sb_gquotino = cpu_to_be64(from->sb_pquotino); + else { + /* + * We can't rely on just the fields being logged to tell us + * that it is safe to write NULLFSINO - we should only do that + * if quotas are not actually enabled. Hence only write + * NULLFSINO if both in-core quota inodes are NULL. + */ + if (from->sb_gquotino == NULLFSINO && + from->sb_pquotino == NULLFSINO) + to->sb_gquotino = cpu_to_be64(NULLFSINO); + } *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO); } @@ -596,6 +593,11 @@ xfs_sb_verify( * single bit error could clear the feature bit and unused parts of the * superblock are supposed to be zero. Hence a non-null crc field indicates that * we've potentially lost a feature bit and we should check it anyway. + * + * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the + * last field in V4 secondary superblocks. So for secondary superblocks, + * we are more forgiving, and ignore CRC failures if the primary doesn't + * indicate that the fs version is V5. */ static void xfs_sb_read_verify( @@ -614,19 +616,22 @@ xfs_sb_read_verify( XFS_SB_VERSION_5) || dsb->sb_crc != 0)) { - if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize), - offsetof(struct xfs_sb, sb_crc))) { - error = EFSCORRUPTED; - goto out_error; + if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { + /* Only fail bad secondaries on a known V5 filesystem */ + if (bp->b_bn == XFS_SB_DADDR || + xfs_sb_version_hascrc(&mp->m_sb)) { + error = EFSBADCRC; + goto out_error; + } } } error = xfs_sb_verify(bp, true); out_error: if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - mp, bp->b_addr); xfs_buf_ioerror(bp, error); + if (error == EFSCORRUPTED || error == EFSBADCRC) + xfs_verifier_error(bp); } } @@ -642,7 +647,6 @@ xfs_sb_quiet_read_verify( { struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); - if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! */ xfs_sb_read_verify(bp); @@ -662,9 +666,8 @@ xfs_sb_write_verify( error = xfs_sb_verify(bp, false); if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - mp, bp->b_addr); xfs_buf_ioerror(bp, error); + xfs_verifier_error(bp); return; } @@ -674,8 +677,7 @@ xfs_sb_write_verify( if (bip) XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_sb, sb_crc)); + xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); } const struct xfs_buf_ops xfs_sb_buf_ops = { diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 6835b44f850..c43c2d609a2 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -36,8 +36,6 @@ struct xfs_trans; #define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ #define XFS_SB_VERSION_NUMBITS 0x000f #define XFS_SB_VERSION_ALLFBITS 0xfff0 -#define XFS_SB_VERSION_SASHFBITS 0xf000 -#define XFS_SB_VERSION_REALFBITS 0x0ff0 #define XFS_SB_VERSION_ATTRBIT 0x0010 #define XFS_SB_VERSION_NLINKBIT 0x0020 #define XFS_SB_VERSION_QUOTABIT 0x0040 @@ -50,24 +48,15 @@ struct xfs_trans; #define XFS_SB_VERSION_DIRV2BIT 0x2000 #define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ #define XFS_SB_VERSION_MOREBITSBIT 0x8000 -#define XFS_SB_VERSION_OKSASHFBITS \ - (XFS_SB_VERSION_EXTFLGBIT | \ - XFS_SB_VERSION_DIRV2BIT | \ - XFS_SB_VERSION_BORGBIT) -#define XFS_SB_VERSION_OKREALFBITS \ - (XFS_SB_VERSION_ATTRBIT | \ - XFS_SB_VERSION_NLINKBIT | \ - XFS_SB_VERSION_QUOTABIT | \ - XFS_SB_VERSION_ALIGNBIT | \ - XFS_SB_VERSION_DALIGNBIT | \ - XFS_SB_VERSION_SHAREDBIT | \ - XFS_SB_VERSION_LOGV2BIT | \ - XFS_SB_VERSION_SECTORBIT | \ - XFS_SB_VERSION_MOREBITSBIT) -#define XFS_SB_VERSION_OKREALBITS \ - (XFS_SB_VERSION_NUMBITS | \ - XFS_SB_VERSION_OKREALFBITS | \ - XFS_SB_VERSION_OKSASHFBITS) + +/* + * Supported feature bit list is just all bits in the versionnum field because + * we've used them all up and understand them all. Except, of course, for the + * shared superblock bit, which nobody knows what it does and so is unsupported. + */ +#define XFS_SB_VERSION_OKBITS \ + ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \ + ~XFS_SB_VERSION_SHAREDBIT) /* * There are two words to hold XFS "feature" bits: the original @@ -76,7 +65,6 @@ struct xfs_trans; * * These defines represent bits in sb_features2. */ -#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */ #define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 #define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 @@ -86,16 +74,11 @@ struct xfs_trans; #define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ #define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ -#define XFS_SB_VERSION2_OKREALFBITS \ +#define XFS_SB_VERSION2_OKBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ XFS_SB_VERSION2_ATTR2BIT | \ XFS_SB_VERSION2_PROJID32BIT | \ XFS_SB_VERSION2_FTYPE) -#define XFS_SB_VERSION2_OKSASHFBITS \ - (0) -#define XFS_SB_VERSION2_OKREALBITS \ - (XFS_SB_VERSION2_OKREALFBITS | \ - XFS_SB_VERSION2_OKSASHFBITS ) /* * Superblock - in core version. Must match the ondisk version below. @@ -182,6 +165,8 @@ typedef struct xfs_sb { /* must be padded to 64 bit alignment */ } xfs_sb_t; +#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) + /* * Superblock - on disk version. Must match the in core version above. * Must be padded to 64 bit alignment. @@ -343,214 +328,140 @@ typedef enum { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -static inline int xfs_sb_good_version(xfs_sb_t *sbp) -{ - /* We always support version 1-3 */ - if (sbp->sb_versionnum >= XFS_SB_VERSION_1 && - sbp->sb_versionnum <= XFS_SB_VERSION_3) - return 1; - - /* We support version 4 if all feature bits are supported */ - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) { - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) - return 0; - - if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) - return 0; - return 1; - } - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) - return 1; - - return 0; -} - /* - * Detect a mismatched features2 field. Older kernels read/wrote - * this into the wrong slot, so to be safe we keep them in sync. + * The first XFS version we support is a v4 superblock with V2 directories. */ -static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp) +static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) { - return (sbp->sb_bad_features2 != sbp->sb_features2); -} - -static inline unsigned xfs_sb_version_tonew(unsigned v) -{ - if (v == XFS_SB_VERSION_1) - return XFS_SB_VERSION_4; - - if (v == XFS_SB_VERSION_2) - return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; + if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) + return false; - return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT | - XFS_SB_VERSION_NLINKBIT; -} + /* check for unknown features in the fs */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; -static inline unsigned xfs_sb_version_toold(unsigned v) -{ - if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) - return 0; - if (v & XFS_SB_VERSION_NLINKBIT) - return XFS_SB_VERSION_3; - if (v & XFS_SB_VERSION_ATTRBIT) - return XFS_SB_VERSION_2; - return XFS_SB_VERSION_1; + return true; } -static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) +static inline bool xfs_sb_good_version(struct xfs_sb *sbp) { - return sbp->sb_versionnum == XFS_SB_VERSION_2 || - sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) + return true; + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + return xfs_sb_good_v4_features(sbp); + return false; } -static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) +/* + * Detect a mismatched features2 field. Older kernels read/wrote + * this into the wrong slot, so to be safe we keep them in sync. + */ +static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) { - if (sbp->sb_versionnum == XFS_SB_VERSION_1) - sbp->sb_versionnum = XFS_SB_VERSION_2; - else if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4) - sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; - else - sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; + return sbp->sb_bad_features2 != sbp->sb_features2; } -static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) { - return sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); + return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); } -static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) +static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) { - if (sbp->sb_versionnum <= XFS_SB_VERSION_2) - sbp->sb_versionnum = XFS_SB_VERSION_3; - else - sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT; + sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; } -static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); + return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); } -static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) +static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) { - if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4) - sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; - else - sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) | - XFS_SB_VERSION_QUOTABIT; + sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; } -static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); } -static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); -} - -static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); -} - -static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)); + return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); } -static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) +static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT)); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); } -static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); } -static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); + return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); } -static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); + return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); } -static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT)); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); } /* * sb_features2 bit version macros. - * - * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro: - * - * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp) - * ((xfs_sb_version_hasmorebits(sbp) && - * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT) */ - -static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) +static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || (xfs_sb_version_hasmorebits(sbp) && (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); } -static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || (xfs_sb_version_hasmorebits(sbp) && (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); } -static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) +static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; + sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT; } -static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) +static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) { sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; if (!sbp->sb_features2) sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; } -static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || (xfs_sb_version_hasmorebits(sbp) && (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); } -static inline void xfs_sb_version_addprojid32bit(xfs_sb_t *sbp) +static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; @@ -585,7 +496,9 @@ xfs_sb_has_compat_feature( return (sbp->sb_features_compat & feature) != 0; } -#define XFS_SB_FEAT_RO_COMPAT_ALL 0 +#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ +#define XFS_SB_FEAT_RO_COMPAT_ALL \ + (XFS_SB_FEAT_RO_COMPAT_FINOBT) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -621,12 +534,12 @@ xfs_sb_has_incompat_log_feature( /* * V5 superblock specific feature checks */ -static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) +static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } -static inline int xfs_sb_version_has_pquotino(xfs_sb_t *sbp) +static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } @@ -639,6 +552,12 @@ static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); } +static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); +} + /* * end of superblock version macros */ @@ -699,7 +618,4 @@ extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); -extern const struct xfs_buf_ops xfs_sb_buf_ops; -extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; - #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h new file mode 100644 index 00000000000..82404da2ca6 --- /dev/null +++ b/fs/xfs/xfs_shared.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SHARED_H__ +#define __XFS_SHARED_H__ + +/* + * Definitions shared between kernel and userspace that don't fit into any other + * header file that is shared with userspace. + */ +struct xfs_ifork; +struct xfs_buf; +struct xfs_buf_ops; +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; + +/* + * Buffer verifier operations are widely used, including userspace tools + */ +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agfl_buf_ops; +extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; +extern const struct xfs_buf_ops xfs_bmbt_buf_ops; +extern const struct xfs_buf_ops xfs_da3_node_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_symlink_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; +extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_inode_buf_ops; +extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_sb_buf_ops; +extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; +extern const struct xfs_buf_ops xfs_symlink_buf_ops; + +/* + * Transaction types. Used to distinguish types of buffers. These never reach + * the log. + */ +#define XFS_TRANS_SETATTR_NOT_SIZE 1 +#define XFS_TRANS_SETATTR_SIZE 2 +#define XFS_TRANS_INACTIVE 3 +#define XFS_TRANS_CREATE 4 +#define XFS_TRANS_CREATE_TRUNC 5 +#define XFS_TRANS_TRUNCATE_FILE 6 +#define XFS_TRANS_REMOVE 7 +#define XFS_TRANS_LINK 8 +#define XFS_TRANS_RENAME 9 +#define XFS_TRANS_MKDIR 10 +#define XFS_TRANS_RMDIR 11 +#define XFS_TRANS_SYMLINK 12 +#define XFS_TRANS_SET_DMATTRS 13 +#define XFS_TRANS_GROWFS 14 +#define XFS_TRANS_STRAT_WRITE 15 +#define XFS_TRANS_DIOSTRAT 16 +/* 17 was XFS_TRANS_WRITE_SYNC */ +#define XFS_TRANS_WRITEID 18 +#define XFS_TRANS_ADDAFORK 19 +#define XFS_TRANS_ATTRINVAL 20 +#define XFS_TRANS_ATRUNCATE 21 +#define XFS_TRANS_ATTR_SET 22 +#define XFS_TRANS_ATTR_RM 23 +#define XFS_TRANS_ATTR_FLAG 24 +#define XFS_TRANS_CLEAR_AGI_BUCKET 25 +#define XFS_TRANS_QM_SBCHANGE 26 +/* + * Dummy entries since we use the transaction type to index into the + * trans_type[] in xlog_recover_print_trans_head() + */ +#define XFS_TRANS_DUMMY1 27 +#define XFS_TRANS_DUMMY2 28 +#define XFS_TRANS_QM_QUOTAOFF 29 +#define XFS_TRANS_QM_DQALLOC 30 +#define XFS_TRANS_QM_SETQLIM 31 +#define XFS_TRANS_QM_DQCLUSTER 32 +#define XFS_TRANS_QM_QINOCREATE 33 +#define XFS_TRANS_QM_QUOTAOFF_END 34 +#define XFS_TRANS_SB_UNIT 35 +#define XFS_TRANS_FSYNC_TS 36 +#define XFS_TRANS_GROWFSRT_ALLOC 37 +#define XFS_TRANS_GROWFSRT_ZERO 38 +#define XFS_TRANS_GROWFSRT_FREE 39 +#define XFS_TRANS_SWAPEXT 40 +#define XFS_TRANS_SB_COUNT 41 +#define XFS_TRANS_CHECKPOINT 42 +#define XFS_TRANS_ICREATE 43 +#define XFS_TRANS_CREATE_TMPFILE 44 +#define XFS_TRANS_TYPE_MAX 44 +/* new transaction types need to be reflected in xfs_logprint(8) */ + +#define XFS_TRANS_TYPES \ + { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ + { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ + { XFS_TRANS_INACTIVE, "INACTIVE" }, \ + { XFS_TRANS_CREATE, "CREATE" }, \ + { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ + { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ + { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ + { XFS_TRANS_REMOVE, "REMOVE" }, \ + { XFS_TRANS_LINK, "LINK" }, \ + { XFS_TRANS_RENAME, "RENAME" }, \ + { XFS_TRANS_MKDIR, "MKDIR" }, \ + { XFS_TRANS_RMDIR, "RMDIR" }, \ + { XFS_TRANS_SYMLINK, "SYMLINK" }, \ + { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ + { XFS_TRANS_GROWFS, "GROWFS" }, \ + { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ + { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ + { XFS_TRANS_WRITEID, "WRITEID" }, \ + { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ + { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ + { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ + { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ + { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ + { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ + { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ + { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ + { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ + { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ + { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ + { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ + { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ + { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ + { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \ + { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ + { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ + { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ + { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ + { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ + { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ + { XFS_TRANS_DUMMY1, "DUMMY1" }, \ + { XFS_TRANS_DUMMY2, "DUMMY2" }, \ + { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } + +/* + * This structure is used to track log items associated with + * a transaction. It points to the log item and keeps some + * flags to track the state of the log item. It also tracks + * the amount of space needed to log the item it describes + * once we get to commit processing (see xfs_trans_commit()). + */ +struct xfs_log_item_desc { + struct xfs_log_item *lid_item; + struct list_head lid_trans; + unsigned char lid_flags; +}; + +#define XFS_LID_DIRTY 0x1 + +/* log size calculation functions */ +int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); +int xfs_log_calc_minimum_size(struct xfs_mount *); + + +/* + * Values for t_flags. + */ +#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ +#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ +#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ +#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ +#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ +#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ +#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer + count in superblock */ +/* + * Values for call flags parameter. + */ +#define XFS_TRANS_RELEASE_LOG_RES 0x4 +#define XFS_TRANS_ABORT 0x8 + +/* + * Field values for xfs_trans_mod_sb. + */ +#define XFS_TRANS_SB_ICOUNT 0x00000001 +#define XFS_TRANS_SB_IFREE 0x00000002 +#define XFS_TRANS_SB_FDBLOCKS 0x00000004 +#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 +#define XFS_TRANS_SB_FREXTENTS 0x00000010 +#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 +#define XFS_TRANS_SB_DBLOCKS 0x00000040 +#define XFS_TRANS_SB_AGCOUNT 0x00000080 +#define XFS_TRANS_SB_IMAXPCT 0x00000100 +#define XFS_TRANS_SB_REXTSIZE 0x00000200 +#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 +#define XFS_TRANS_SB_RBLOCKS 0x00000800 +#define XFS_TRANS_SB_REXTENTS 0x00001000 +#define XFS_TRANS_SB_REXTSLOG 0x00002000 + +/* + * Here we centralize the specification of XFS meta-data buffer reference count + * values. This determines how hard the buffer cache tries to hold onto the + * buffer. + */ +#define XFS_AGF_REF 4 +#define XFS_AGI_REF 4 +#define XFS_AGFL_REF 3 +#define XFS_INO_BTREE_REF 3 +#define XFS_ALLOC_BTREE_REF 2 +#define XFS_BMAP_BTREE_REF 2 +#define XFS_DIR_BTREE_REF 2 +#define XFS_INO_REF 2 +#define XFS_ATTR_BTREE_REF 1 +#define XFS_DQUOT_REF 1 + +/* + * Flags for xfs_trans_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ + + +/* + * Symlink decoding/encoding functions + */ +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); +int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp); + +#endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index ce372b7d564..f2240383d4b 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -59,6 +59,7 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v) { "abtc2", XFSSTAT_END_ABTC_V2 }, { "bmbt2", XFSSTAT_END_BMBT_V2 }, { "ibt2", XFSSTAT_END_IBT_V2 }, + { "fibt2", XFSSTAT_END_FIBT_V2 }, /* we print both series of quota information together */ { "qm", XFSSTAT_END_QM }, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index c03ad38ceae..c8f238b8299 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -183,7 +183,23 @@ struct xfsstats { __uint32_t xs_ibt_2_alloc; __uint32_t xs_ibt_2_free; __uint32_t xs_ibt_2_moves; -#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6) +#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) + __uint32_t xs_fibt_2_lookup; + __uint32_t xs_fibt_2_compare; + __uint32_t xs_fibt_2_insrec; + __uint32_t xs_fibt_2_delrec; + __uint32_t xs_fibt_2_newroot; + __uint32_t xs_fibt_2_killroot; + __uint32_t xs_fibt_2_increment; + __uint32_t xs_fibt_2_decrement; + __uint32_t xs_fibt_2_lshift; + __uint32_t xs_fibt_2_rshift; + __uint32_t xs_fibt_2_split; + __uint32_t xs_fibt_2_join; + __uint32_t xs_fibt_2_alloc; + __uint32_t xs_fibt_2_free; + __uint32_t xs_fibt_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; __uint32_t xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 15188cc9944..8f0333b3f7a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -17,34 +17,26 @@ */ #include "xfs.h" +#include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_log.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_bmap.h" -#include "xfs_rtalloc.h" +#include "xfs_alloc.h" #include "xfs_error.h" -#include "xfs_itable.h" #include "xfs_fsops.h" -#include "xfs_attr.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_trans_priv.h" -#include "xfs_filestream.h" #include "xfs_da_btree.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" @@ -52,6 +44,9 @@ #include "xfs_icache.h" #include "xfs_trace.h" #include "xfs_icreate_item.h" +#include "xfs_dinode.h" +#include "xfs_filestream.h" +#include "xfs_quota.h" #include <linux/namei.h> #include <linux/init.h> @@ -770,20 +765,18 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1, - mp->m_fsname); + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1, - mp->m_fsname); + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -816,8 +809,7 @@ xfs_setup_devices( { int error; - error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, - mp->m_sb.sb_sectsize); + error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); if (error) return error; @@ -827,14 +819,12 @@ xfs_setup_devices( if (xfs_sb_version_hassector(&mp->m_sb)) log_sector_size = mp->m_sb.sb_logsectsize; error = xfs_setsize_buftarg(mp->m_logdev_targp, - mp->m_sb.sb_blocksize, log_sector_size); if (error) return error; } if (mp->m_rtdev_targp) { error = xfs_setsize_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_blocksize, mp->m_sb.sb_sectsize); if (error) return error; @@ -946,10 +936,6 @@ xfs_fs_destroy_inode( XFS_STATS_INC(vn_reclaim); - /* bad inode, get out here ASAP */ - if (is_bad_inode(inode)) - goto out_reclaim; - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); /* @@ -965,7 +951,6 @@ xfs_fs_destroy_inode( * this more efficiently than we can here, so simply let background * reclaim tear down all inodes. */ -out_reclaim: xfs_inode_set_reclaim_tag(ip); } @@ -1006,7 +991,7 @@ xfs_fs_evict_inode( trace_xfs_evict_inode(ip); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); @@ -1165,7 +1150,7 @@ xfs_restore_resvblks(struct xfs_mount *mp) * Note: xfs_log_quiesce() stops background log work - the callers must ensure * it is started again when appropriate. */ -void +static void xfs_quiesce_attr( struct xfs_mount *mp) { @@ -1207,6 +1192,7 @@ xfs_fs_remount( char *p; int error; + sync_filesystem(sb); while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1246,7 +1232,7 @@ xfs_fs_remount( */ #if 0 xfs_info(mp, - "mount option \"%s\" not supported for remount\n", p); + "mount option \"%s\" not supported for remount", p); return -EINVAL; #else break; @@ -1442,11 +1428,11 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - error = xfs_init_mount_workqueues(mp); + error = -xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; - error = xfs_icsb_init_counters(mp); + error = -xfs_icsb_init_counters(mp); if (error) goto out_destroy_workqueues; @@ -1491,10 +1477,6 @@ xfs_fs_fill_super( error = ENOENT; goto out_unmount; } - if (is_bad_inode(root)) { - error = EINVAL; - goto out_unmount; - } sb->s_root = d_make_root(root); if (!sb->s_root) { error = ENOMEM; @@ -1767,13 +1749,9 @@ init_xfs_fs(void) if (error) goto out_destroy_wq; - error = xfs_filestream_init(); - if (error) - goto out_mru_cache_uninit; - error = xfs_buf_init(); if (error) - goto out_filestream_uninit; + goto out_mru_cache_uninit; error = xfs_init_procfs(); if (error) @@ -1800,8 +1778,6 @@ init_xfs_fs(void) xfs_cleanup_procfs(); out_buf_terminate: xfs_buf_terminate(); - out_filestream_uninit: - xfs_filestream_uninit(); out_mru_cache_uninit: xfs_mru_cache_uninit(); out_destroy_wq: @@ -1820,7 +1796,6 @@ exit_xfs_fs(void) xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_buf_terminate(); - xfs_filestream_uninit(); xfs_mru_cache_uninit(); xfs_destroy_workqueues(); xfs_destroy_zones(); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index f622a97a7e3..d69363c833e 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -17,31 +17,32 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_shared.h" #include "xfs_fs.h" #include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_dir2_format.h" #include "xfs_dir2.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_bmap.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap_util.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_symlink.h" -#include "xfs_buf_item.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_dinode.h" /* ----- Kernel only functions below ----- */ STATIC int @@ -80,6 +81,10 @@ xfs_readlink_bmap( if (error) { xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; goto out; } byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); @@ -88,7 +93,7 @@ xfs_readlink_bmap( cur_chunk = bp->b_addr; if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (!xfs_symlink_hdr_ok(mp, ip->i_ino, offset, + if (!xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp)) { error = EFSCORRUPTED; xfs_alert(mp, @@ -208,10 +213,7 @@ xfs_symlink( return XFS_ERROR(ENAMETOOLONG); udqp = gdqp = NULL; - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = xfs_get_projid(dp); - else - prid = XFS_PROJID_DEFAULT; + prid = xfs_get_initial_prid(dp); /* * Make sure that we have allocated dquot(s) on disk. @@ -424,8 +426,7 @@ xfs_symlink( */ STATIC int xfs_inactive_symlink_rmt( - xfs_inode_t *ip, - xfs_trans_t **tpp) + struct xfs_inode *ip) { xfs_buf_t *bp; int committed; @@ -437,11 +438,9 @@ xfs_inactive_symlink_rmt( xfs_mount_t *mp; xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; int nmaps; - xfs_trans_t *ntp; int size; xfs_trans_t *tp; - tp = *tpp; mp = ip->i_mount; ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS); /* @@ -453,6 +452,16 @@ xfs_inactive_symlink_rmt( */ ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + /* * Lock the inode, fix the size, and join it to the transaction. * Hold it so in the normal path, we still have it locked for @@ -471,7 +480,7 @@ xfs_inactive_symlink_rmt( error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), mval, &nmaps, 0); if (error) - goto error0; + goto error_trans_cancel; /* * Invalidate the block(s). No validation is done. */ @@ -481,22 +490,24 @@ xfs_inactive_symlink_rmt( XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); if (!bp) { error = ENOMEM; - goto error1; + goto error_bmap_cancel; } xfs_trans_binval(tp, bp); } /* * Unmap the dead block(s) to the free_list. */ - if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, - &first_block, &free_list, &done))) - goto error1; + error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + &first_block, &free_list, &done); + if (error) + goto error_bmap_cancel; ASSERT(done); /* * Commit the first transaction. This logs the EFI and the inode. */ - if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) - goto error1; + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto error_bmap_cancel; /* * The transaction must have been committed, since there were * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. @@ -511,26 +522,13 @@ xfs_inactive_symlink_rmt( xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* - * Get a new, empty transaction to return to our caller. - */ - ntp = xfs_trans_dup(tp); - /* * Commit the transaction containing extent freeing and EFDs. - * If we get an error on the commit here or on the reserve below, - * we need to unlock the inode since the new transaction doesn't - * have the inode attached. */ - error = xfs_trans_commit(tp, 0); - tp = ntp; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; + goto error_unlock; } - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); /* * Remove the memory for extent descriptions (just bookkeeping). @@ -538,23 +536,16 @@ xfs_inactive_symlink_rmt( if (ip->i_df.if_bytes) xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); ASSERT(ip->i_df.if_bytes == 0); - /* - * Put an itruncate log reservation in the new transaction - * for our caller. - */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; - } - xfs_trans_ijoin(tp, ip, 0); - *tpp = tp; + xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; - error1: +error_bmap_cancel: xfs_bmap_cancel(&free_list); - error0: +error_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -563,41 +554,46 @@ xfs_inactive_symlink_rmt( */ int xfs_inactive_symlink( - struct xfs_inode *ip, - struct xfs_trans **tp) + struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; int pathlen; trace_xfs_inactive_symlink(ip); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + xfs_ilock(ip, XFS_ILOCK_EXCL); + /* * Zero length symlinks _can_ exist. */ pathlen = (int)ip->i_d.di_size; - if (!pathlen) + if (!pathlen) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; + } if (pathlen < 0 || pathlen > MAXPATHLEN) { xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", __func__, (unsigned long long)ip->i_ino, pathlen); + xfs_iunlock(ip, XFS_ILOCK_EXCL); ASSERT(0); return XFS_ERROR(EFSCORRUPTED); } if (ip->i_df.if_flags & XFS_IFINLINE) { - if (ip->i_df.if_bytes > 0) + if (ip->i_df.if_bytes > 0) xfs_idata_realloc(ip, -(ip->i_df.if_bytes), XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); ASSERT(ip->i_df.if_bytes == 0); return 0; } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* remove the remote symlink */ - return xfs_inactive_symlink_rmt(ip, tp); + return xfs_inactive_symlink_rmt(ip); } diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index 99338ba666a..e75245d0911 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -22,6 +22,6 @@ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); int xfs_readlink(struct xfs_inode *ip, char *link); -int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp); +int xfs_inactive_symlink(struct xfs_inode *ip); #endif /* __XFS_SYMLINK_H */ diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c index 01c85e3f647..23c2f2577c8 100644 --- a/fs/xfs/xfs_symlink_remote.c +++ b/fs/xfs/xfs_symlink_remote.c @@ -19,8 +19,9 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" #include "xfs_ag.h" #include "xfs_sb.h" #include "xfs_mount.h" @@ -30,6 +31,7 @@ #include "xfs_trace.h" #include "xfs_symlink.h" #include "xfs_cksum.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" @@ -78,7 +80,6 @@ xfs_symlink_hdr_set( */ bool xfs_symlink_hdr_ok( - struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, uint32_t size, @@ -131,12 +132,13 @@ xfs_symlink_read_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_dsymlink_hdr, sl_crc)) || - !xfs_symlink_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_symlink_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -151,8 +153,8 @@ xfs_symlink_write_verify( return; if (!xfs_symlink_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -160,8 +162,7 @@ xfs_symlink_write_verify( struct xfs_dsymlink_hdr *dsl = bp->b_addr; dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); } - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_dsymlink_hdr, sl_crc)); + xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF); } const struct xfs_buf_ops xfs_symlink_buf_ops = { diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 5d7b3e40705..1e85bcd0e41 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -17,19 +17,16 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_ialloc.h" #include "xfs_itable.h" @@ -37,6 +34,8 @@ #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" +#include "xfs_trans.h" +#include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_buf_item.h" #include "xfs_quota.h" @@ -46,6 +45,8 @@ #include "xfs_dquot.h" #include "xfs_log_recover.h" #include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_filestream.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 47910e638c1..152f8278263 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -31,8 +31,8 @@ struct xfs_da_args; struct xfs_da_node_entry; struct xfs_dquot; struct xfs_log_item; -struct xlog_ticket; struct xlog; +struct xlog_ticket; struct xlog_recover; struct xlog_recover_item; struct xfs_buf_log_format; @@ -135,6 +135,31 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); +DECLARE_EVENT_CLASS(xfs_ag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), + TP_ARGS(mp, agno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + ), + TP_printk("dev %d:%d agno %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno) +); +#define DEFINE_AG_EVENT(name) \ +DEFINE_EVENT(xfs_ag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), \ + TP_ARGS(mp, agno)) + +DEFINE_AG_EVENT(xfs_read_agf); +DEFINE_AG_EVENT(xfs_alloc_read_agf); +DEFINE_AG_EVENT(xfs_read_agi); +DEFINE_AG_EVENT(xfs_ialloc_read_agi); + TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, struct xfs_da_node_entry *btree), @@ -513,6 +538,64 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); +DECLARE_EVENT_CLASS(xfs_filestream_class, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), + TP_ARGS(ip, agno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams) +) +#define DEFINE_FILESTREAM_EVENT(name) \ +DEFINE_EVENT(xfs_filestream_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \ + TP_ARGS(ip, agno)) +DEFINE_FILESTREAM_EVENT(xfs_filestream_free); +DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); +DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); + +TRACE_EVENT(xfs_filestream_pick, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno, + xfs_extlen_t free, int nscan), + TP_ARGS(ip, agno, free, nscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + __field(xfs_extlen_t, free) + __field(int, nscan) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + __entry->free = free; + __entry->nscan = nscan; + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams, + __entry->free, + __entry->nscan) +); + DECLARE_EVENT_CLASS(xfs_lock_class, TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, unsigned long caller_ip), @@ -578,6 +661,8 @@ DEFINE_INODE_EVENT(xfs_readlink); DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); +DEFINE_INODE_EVENT(xfs_collapse_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL DEFINE_INODE_EVENT(xfs_get_acl); @@ -938,6 +1023,63 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); +DECLARE_EVENT_CLASS(xfs_ail_class, + TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), + TP_ARGS(lip, old_lsn, new_lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, old_lsn) + __field(xfs_lsn_t, new_lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->old_lsn = old_lsn; + __entry->new_lsn = new_lsn; + ), + TP_printk("dev %d:%d lip 0x%p old lsn %d/%d new lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), + CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_AIL_EVENT(name) \ +DEFINE_EVENT(xfs_ail_class, name, \ + TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), \ + TP_ARGS(lip, old_lsn, new_lsn)) +DEFINE_AIL_EVENT(xfs_ail_insert); +DEFINE_AIL_EVENT(xfs_ail_move); +DEFINE_AIL_EVENT(xfs_ail_delete); + +TRACE_EVENT(xfs_log_assign_tail_lsn, + TP_PROTO(struct xlog *log, xfs_lsn_t new_lsn), + TP_ARGS(log, new_lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, new_lsn) + __field(xfs_lsn_t, old_lsn) + __field(xfs_lsn_t, last_sync_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->new_lsn = new_lsn; + __entry->old_lsn = atomic64_read(&log->l_tail_lsn); + __entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + ), + TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), + CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), + CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn)) +) DECLARE_EVENT_CLASS(xfs_file_class, TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), @@ -976,7 +1118,6 @@ DEFINE_RW_EVENT(xfs_file_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_splice_read); -DEFINE_RW_EVENT(xfs_file_splice_write); DECLARE_EVENT_CLASS(xfs_page_class, TP_PROTO(struct inode *inode, struct page *page, unsigned long off, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 5411e01ab45..d03932564cc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -18,32 +18,21 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_error.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_alloc.h" #include "xfs_extent_busy.h" -#include "xfs_bmap.h" #include "xfs_quota.h" -#include "xfs_qm.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" -#include "xfs_trans_space.h" -#include "xfs_inode_item.h" -#include "xfs_log_priv.h" -#include "xfs_buf_item.h" +#include "xfs_log.h" #include "xfs_trace.h" +#include "xfs_error.h" kmem_zone_t *xfs_trans_zone; kmem_zone_t *xfs_log_item_desc_zone; @@ -838,7 +827,7 @@ xfs_trans_committed_bulk( xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); spin_lock(&ailp->xa_lock); - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); } @@ -898,12 +887,7 @@ xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags); - if (error == ENOMEM) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - error = XFS_ERROR(EIO); - goto out_unreserve; - } + xfs_log_commit_cil(mp, tp, &commit_lsn, flags); current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free(tp); @@ -913,10 +897,7 @@ xfs_trans_commit( * log out now and wait for it. */ if (sync) { - if (!error) { - error = _xfs_log_force_lsn(mp, commit_lsn, - XFS_LOG_SYNC, NULL); - } + error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); XFS_STATS_INC(xs_trans_sync); } else { XFS_STATS_INC(xs_trans_async); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 09cf40b89e8..b5bc1ab3c4d 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -18,10 +18,6 @@ #ifndef __XFS_TRANS_H__ #define __XFS_TRANS_H__ -struct xfs_log_item; - -#include "xfs_trans_resv.h" - /* kernel only transaction subsystem defines */ struct xfs_buf; @@ -68,7 +64,7 @@ typedef struct xfs_log_item { struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); - void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); + void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *); void (*iop_pin)(xfs_log_item_t *); void (*iop_unpin)(xfs_log_item_t *, int remove); uint (*iop_push)(struct xfs_log_item *, struct list_head *); @@ -77,6 +73,9 @@ struct xfs_item_ops { void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); }; +void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, + int type, const struct xfs_item_ops *ops); + /* * Return values for the iop_push() routines. */ @@ -85,18 +84,12 @@ struct xfs_item_ops { #define XFS_ITEM_LOCKED 2 #define XFS_ITEM_FLUSHING 3 -/* - * This is the type of function which can be given to xfs_trans_callback() - * to be called upon the transaction's commit to disk. - */ -typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *); /* * This is the structure maintained for every active transaction. */ typedef struct xfs_trans { unsigned int t_magic; /* magic number */ - xfs_log_callback_t t_logcb; /* log callback struct */ unsigned int t_type; /* transaction type */ unsigned int t_log_res; /* amt of log space resvd */ unsigned int t_log_count; /* count for perm log res */ @@ -132,7 +125,6 @@ typedef struct xfs_trans { int64_t t_rextents_delta;/* superblocks rextents chg */ int64_t t_rextslog_delta;/* superblocks rextslog chg */ struct list_head t_items; /* log item descriptors */ - xfs_trans_header_t t_header; /* header for in-log trans */ struct list_head t_busy; /* list of busy extents */ unsigned long t_pflags; /* saved process flags state */ } xfs_trans_t; @@ -237,10 +229,16 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, xfs_fsblock_t, xfs_extlen_t); int xfs_trans_commit(xfs_trans_t *, uint flags); +int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); +void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *, + enum xfs_blft); +void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, + struct xfs_buf *src_bp); + extern kmem_zone_t *xfs_trans_zone; extern kmem_zone_t *xfs_log_item_desc_zone; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 21c6d7ddbc0..cb0f3a84cc6 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -18,15 +18,16 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_log.h" #ifdef DEBUG /* @@ -172,7 +173,6 @@ xfs_trans_ail_cursor_next( */ void xfs_trans_ail_cursor_done( - struct xfs_ail *ailp, struct xfs_ail_cursor *cur) { cur->item = NULL; @@ -367,7 +367,7 @@ xfsaild_push( * If the AIL is empty or our push has reached the end we are * done now. */ - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); goto out_done; } @@ -452,7 +452,7 @@ xfsaild_push( break; lsn = lip->li_lsn; } - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) @@ -658,11 +658,13 @@ xfs_trans_ail_update_bulk( if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) continue; + trace_xfs_ail_move(lip, lip->li_lsn, lsn); xfs_ail_delete(ailp, lip); if (mlip == lip) mlip_changed = 1; } else { lip->li_flags |= XFS_LI_IN_AIL; + trace_xfs_ail_insert(lip, 0, lsn); } lip->li_lsn = lsn; list_add(&lip->li_ail, &tmp); @@ -731,6 +733,7 @@ xfs_trans_ail_delete_bulk( return; } + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); lip->li_flags &= ~XFS_LI_IN_AIL; lip->li_lsn = 0; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8c75b8f6727..b8eef0549f3 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -17,17 +17,15 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" @@ -277,6 +275,10 @@ xfs_trans_read_buf_map( XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } #ifdef DEBUG @@ -316,7 +318,18 @@ xfs_trans_read_buf_map( ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); bp->b_ops = ops; - xfsbdstrat(tp->t_mountp, bp); + + /* + * XXX(hch): clean up the error handling here to be less + * of a mess.. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + xfs_bioerror_relse(bp); + } else { + xfs_buf_iorequest(bp); + } + error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, __func__); @@ -329,6 +342,9 @@ xfs_trans_read_buf_map( if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } } @@ -366,6 +382,10 @@ xfs_trans_read_buf_map( if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } #ifdef DEBUG diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 54ee3c5dee7..41172861e85 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -17,23 +17,18 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_itable.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" +#include "xfs_quota.h" #include "xfs_qm.h" STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); @@ -300,8 +295,8 @@ xfs_trans_mod_dquot( /* * Given an array of dqtrx structures, lock all the dquots associated and join * them to the transaction, provided they have been modified. We know that the - * highest number of dquots of one type - usr, grp OR prj - involved in a - * transaction is 2 so we don't need to make this very generic. + * highest number of dquots of one type - usr, grp and prj - involved in a + * transaction is 3 so we don't need to make this very generic. */ STATIC void xfs_trans_dqlockedjoin( diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 8d71b16ecca..47978ba89da 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -17,12 +17,13 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 53dfe46f368..50c3f561428 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -17,18 +17,15 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_inode_item.h" #include "xfs_trace.h" @@ -114,12 +111,14 @@ xfs_trans_log_inode( /* * First time we log the inode in a transaction, bump the inode change - * counter if it is configured for this to occur. + * counter if it is configured for this to occur. We don't use + * inode_inc_version() because there is no need for extra locking around + * i_version as we already hold the inode locked exclusively for + * metadata modification. */ if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && IS_I_VERSION(VFS_I(ip))) { - inode_inc_iversion(VFS_I(ip)); - ip->i_d.di_changecount = VFS_I(ip)->i_version; + ip->i_d.di_changecount = ++VFS_I(ip)->i_version; flags |= XFS_ILOG_CORE; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index c52def0b441..bd1281862ad 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -27,7 +27,6 @@ struct xfs_log_vec; void xfs_trans_init(struct xfs_mount *); -int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, @@ -134,8 +133,7 @@ struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, xfs_lsn_t lsn); struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, struct xfs_ail_cursor *cur); -void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, - struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur); #if BITS_PER_LONG != 64 static inline void diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index a65a3cc4061..f2bda7c76b8 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c @@ -18,27 +18,20 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_log.h" +#include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_error.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" +#include "xfs_bmap_btree.h" #include "xfs_ialloc.h" -#include "xfs_alloc.h" -#include "xfs_extent_busy.h" -#include "xfs_bmap.h" -#include "xfs_bmap_util.h" #include "xfs_quota.h" +#include "xfs_trans.h" #include "xfs_qm.h" #include "xfs_trans_space.h" #include "xfs_trace.h" @@ -89,20 +82,69 @@ xfs_calc_buf_res( * on disk. Hence we need an inode reservation function that calculates all this * correctly. So, we log: * - * - log op headers for object + * - 4 log op headers for object + * - for the ilf, the inode core and 2 forks * - inode log format object - * - the entire inode contents (core + 2 forks) - * - two bmap btree block headers + * - the inode core + * - two inode forks containing bmap btree root blocks. + * - the btree data contained by both forks will fit into the inode size, + * hence when combined with the inode core above, we have a total of the + * actual inode size. + * - the BMBT headers need to be accounted separately, as they are + * additional to the records and pointers that fit inside the inode + * forks. */ STATIC uint xfs_calc_inode_res( struct xfs_mount *mp, uint ninodes) { - return ninodes * (sizeof(struct xlog_op_header) + - sizeof(struct xfs_inode_log_format) + - mp->m_sb.sb_inodesize + - 2 * XFS_BMBT_BLOCK_LEN(mp)); + return ninodes * + (4 * sizeof(struct xlog_op_header) + + sizeof(struct xfs_inode_log_format) + + mp->m_sb.sb_inodesize + + 2 * XFS_BMBT_BLOCK_LEN(mp)); +} + +/* + * The free inode btree is a conditional feature and the log reservation + * requirements differ slightly from that of the traditional inode allocation + * btree. The finobt tracks records for inode chunks with at least one free + * inode. A record can be removed from the tree for an inode allocation + * or free and thus the finobt reservation is unconditional across: + * + * - inode allocation + * - inode free + * - inode chunk allocation + * + * The 'modify' param indicates to include the record modification scenario. The + * 'alloc' param indicates to include the reservation for free space btree + * modifications on behalf of finobt modifications. This is required only for + * transactions that do not already account for free space btree modifications. + * + * the free inode btree: max depth * block size + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the free inode btree entry: block size + */ +STATIC uint +xfs_calc_finobt_res( + struct xfs_mount *mp, + int alloc, + int modify) +{ + uint res; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)); + if (alloc) + res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); + if (modify) + res += (uint)XFS_FSB_TO_B(mp, 1); + + return res; } /* @@ -182,7 +224,7 @@ xfs_calc_itruncate_reservation( xfs_calc_buf_res(5, 0) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0))); } @@ -212,6 +254,19 @@ xfs_calc_rename_reservation( } /* + * For removing an inode from unlinked list at first, we can modify: + * the agi hash list and counters: sector size + * the on disk inode before ours in the agi hash list: inode cluster size + */ +STATIC uint +xfs_calc_iunlink_remove_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); +} + +/* * For creating a link to an inode: * the parent directory inode: inode size * the linked inode: inode size @@ -228,6 +283,7 @@ xfs_calc_link_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + + xfs_calc_iunlink_remove_reservation(mp) + MAX((xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), @@ -237,6 +293,18 @@ xfs_calc_link_reservation( } /* + * For adding an inode to unlinked list we can modify: + * the agi hash list: sector size + * the unlinked inode: inode size + */ +STATIC uint +xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_inode_res(mp, 1); +} + +/* * For removing a directory entry we can modify: * the parent directory inode: inode size * the removed inode: inode size @@ -253,10 +321,11 @@ xfs_calc_remove_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_inode_res(mp, 2) + + xfs_calc_iunlink_add_reservation(mp) + + MAX((xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -275,6 +344,7 @@ xfs_calc_remove_reservation( * the superblock for the nlink flag: sector size * the directory btree: (max depth + v2) * dir block size * the directory inode's bmap btree: (max depth + v2) * block size + * the finobt (record modification and allocation btrees) */ STATIC uint xfs_calc_create_resv_modify( @@ -283,14 +353,15 @@ xfs_calc_create_resv_modify( return xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + (uint)XFS_FSB_TO_B(mp, 1) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 1, 1); } /* * For create we can allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize * the superblock for the nlink flag: sector size - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode blocks allocated: mp->m_ialloc_blks * blocksize * the inode btree: max depth * blocksize * the allocation btrees: 2 trees * (max depth - 1) * block size */ @@ -300,7 +371,7 @@ xfs_calc_create_resv_alloc( { return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + mp->m_sb.sb_sectsize + - xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), XFS_FSB_TO_B(mp, 1)); @@ -321,6 +392,7 @@ __xfs_calc_create_reservation( * the superblock for the nlink flag: sector size * the inode btree: max depth * blocksize * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion) */ STATIC uint xfs_calc_icreate_resv_alloc( @@ -330,7 +402,8 @@ xfs_calc_icreate_resv_alloc( mp->m_sb.sb_sectsize + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)); + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 0); } STATIC uint @@ -351,6 +424,20 @@ xfs_calc_create_reservation( } +STATIC uint +xfs_calc_create_tmpfile_reservation( + struct xfs_mount *mp) +{ + uint res = XFS_DQUOT_LOGRES(mp); + + if (xfs_sb_version_hascrc(&mp->m_sb)) + res += xfs_calc_icreate_resv_alloc(mp); + else + res += xfs_calc_create_resv_alloc(mp); + + return res + xfs_calc_iunlink_add_reservation(mp); +} + /* * Making a new directory is the same as creating a new file. */ @@ -384,6 +471,7 @@ xfs_calc_symlink_reservation( * the on disk inode before ours in the agi hash list: inode cluster size * the inode btree: max depth * blocksize * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion, removal or modification) */ STATIC uint xfs_calc_ifree_reservation( @@ -391,15 +479,15 @@ xfs_calc_ifree_reservation( { return XFS_DQUOT_LOGRES(mp) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + - MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), - XFS_INODE_CLUSTER_SIZE(mp)) + + xfs_calc_iunlink_remove_reservation(mp) + xfs_calc_buf_res(1, 0) + - xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)); + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 1); } /* @@ -522,7 +610,7 @@ xfs_calc_addafork_reservation( return XFS_DQUOT_LOGRES(mp) + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(1, mp->m_dirblksize) + + xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), @@ -653,15 +741,14 @@ xfs_calc_qm_setqlim_reservation( /* * Allocating quota on disk if needed. - * the write transaction log space: M_RES(mp)->tr_write.tr_logres + * the write transaction log space for quota file extent allocation * the unit of quota allocation: one system block size */ STATIC uint xfs_calc_qm_dqalloc_reservation( struct xfs_mount *mp) { - ASSERT(M_RES(mp)->tr_write.tr_logres); - return M_RES(mp)->tr_write.tr_logres + + return xfs_calc_write_reservation(mp) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); } @@ -738,6 +825,11 @@ xfs_trans_resv_calc( resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_create_tmpfile.tr_logres = + xfs_calc_create_tmpfile_reservation(mp); + resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT; + resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; @@ -793,7 +885,6 @@ xfs_trans_resv_calc( /* The following transaction are logged in logical format */ resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp); resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); - resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp); resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp); resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp); resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp); diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h index de7de9aaad8..1097d14cd58 100644 --- a/fs/xfs/xfs_trans_resv.h +++ b/fs/xfs/xfs_trans_resv.h @@ -38,11 +38,11 @@ struct xfs_trans_resv { struct xfs_trans_res tr_remove; /* unlink trans */ struct xfs_trans_res tr_symlink; /* symlink trans */ struct xfs_trans_res tr_create; /* create trans */ + struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */ struct xfs_trans_res tr_mkdir; /* mkdir trans */ struct xfs_trans_res tr_ifree; /* inode free trans */ struct xfs_trans_res tr_ichange; /* inode update trans */ struct xfs_trans_res tr_growdata; /* fs data section grow trans */ - struct xfs_trans_res tr_swrite; /* sync write inode trans */ struct xfs_trans_res tr_addafork; /* add inode attr fork trans */ struct xfs_trans_res tr_writeid; /* write setuid/setgid file */ struct xfs_trans_res tr_attrinval; /* attr fork buffer @@ -100,6 +100,7 @@ struct xfs_trans_resv { #define XFS_ITRUNCATE_LOG_COUNT 2 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 +#define XFS_CREATE_TMPFILE_LOG_COUNT 2 #define XFS_MKDIR_LOG_COUNT 3 #define XFS_SYMLINK_LOG_COUNT 3 #define XFS_REMOVE_LOG_COUNT 2 diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h index 7d2c920dfb9..bf9c4579334 100644 --- a/fs/xfs/xfs_trans_space.h +++ b/fs/xfs/xfs_trans_space.h @@ -28,7 +28,8 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) -#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1) +#define XFS_DAENTER_1B(mp,w) \ + ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0)) #define XFS_DAENTER_BLOCKS(mp,w) \ @@ -47,13 +48,15 @@ #define XFS_DIRREMOVE_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ - (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1) + ((mp)->m_ialloc_blks + \ + (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \ + ((mp)->m_in_maxlevels - 1))) /* * Space reservation values for various transactions. */ #define XFS_ADDAFORK_SPACE_RES(mp) \ - ((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) + ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) #define XFS_ATTRRM_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK) /* This macro is not used - see inline code in xfs_attr_set */ @@ -82,5 +85,8 @@ (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) +#define XFS_IFREE_SPACE_RES(mp) \ + (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0) + #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 82bbc34d54a..65c6e6650b1 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -134,7 +134,7 @@ typedef enum { typedef enum { XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, - XFS_BTNUM_MAX + XFS_BTNUM_FINOi, XFS_BTNUM_MAX } xfs_btnum_t; struct xfs_name { diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h index db14d0c0868..e8a77383c0d 100644 --- a/fs/xfs/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h @@ -25,14 +25,6 @@ struct xfs_inode; struct attrlist_cursor_kern; /* - * Return values for xfs_inactive. A return value of - * VN_INACTIVE_NOCACHE implies that the file system behavior - * has disassociated its state and bhv_desc_t from the vnode. - */ -#define VN_INACTIVE_CACHE 0 -#define VN_INACTIVE_NOCACHE 1 - -/* * Flags for read/write calls - same values as IRIX */ #define IO_ISDIRECT 0x00004 /* bypass page cache */ @@ -43,15 +35,6 @@ struct attrlist_cursor_kern; { IO_INVIS, "INVIS"} /* - * Flush/Invalidate options for vop_toss/flush/flushinval_pages. - */ -#define FI_NONE 0 /* none */ -#define FI_REMAPF 1 /* Do a remapf prior to the operation */ -#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation. - Prevent VM access to the pages until - the operation completes. */ - -/* * Some useful predicates. */ #define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index e01f35ea76b..78ed92a46fd 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -17,9 +17,13 @@ */ #include "xfs.h" +#include "xfs_format.h" #include "xfs_log_format.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" @@ -98,8 +102,8 @@ const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, #ifdef CONFIG_XFS_POSIX_ACL - &xfs_xattr_acl_access_handler, - &xfs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; |
